mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 17:18:14 +03:00
Follow-up to the CONNECT-tunnel change, from an adversarial review (the proxy response is hostile input: a malicious or MITM proxy controls every byte). - Bound the response read so a proxy cannot stall the single-threaded back_wait crawl: proxy_getline now fails on an over-long line instead of consuming it forever, the header drain is capped at 64 lines, and the send loop gives up rather than spin against a socket that reports writable but never accepts. - Size `authority` to hold any url_adr host (HTS_URLMAXSIZE*2) so an oversized hostname can't trip the abort-on-overflow buff helpers; grow `req` to match. - Reject control bytes in the CONNECT authority as a local backstop; today the CR/LF defense lives entirely upstream (escape_remove_control / header-line splitting). - Test: the origin now records the headers it receives, and the test asserts Proxy-Authorization never reaches the origin through the tunnel (the previous assertions couldn't see a leak). Added a flooding-proxy scenario that proves the crawl terminates instead of hanging on an unbounded response. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>
137 lines
4.6 KiB
Bash
137 lines
4.6 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Issue #85: an https crawl must go through the configured proxy (CONNECT
|
|
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
|
|
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
|
|
|
|
# Strict mode: stop on command failure, on unset variables, and on a failing
# stage anywhere in a pipeline.
set -euo pipefail

# Root of the source tree; an inherited non-empty value wins, otherwise "..".
top_srcdir="${top_srcdir:-..}"

# Nothing to test when the build has no https support (77 is presumably the
# harness's "skipped" status -- the message below says as much).
case "${HTTPS_SUPPORT:-}" in
no)
  echo "no https support compiled, skipping"
  exit 77
  ;;
esac

# The local origin/proxy pair needs python3, the certificate needs openssl.
for tool in python3 openssl; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "python3/openssl missing, skipping"
    exit 77
  fi
done

# Helper that runs the TLS origin and the logging CONNECT proxy.
server="$top_srcdir/tests/proxy-https-server.py"
# Scratch directory holding certificates, logs and crawl output.
tmpdir=$(mktemp -d)
# Space-separated PIDs of background servers, consumed by cleanup().
pids=
|
|
|
|
# Stop every background server listed in $pids, reap it, then remove the
# scratch directory. Installed as the EXIT trap, so it runs on every exit path.
#
# Globals: pids (read)   - space-separated PIDs of background servers
#          tmpdir (read) - scratch directory to delete
cleanup() {
  local pid
  # $pids is deliberately unquoted: it is a whitespace-separated PID list.
  for pid in $pids; do
    # Already gone (or never ours): nothing to stop or reap.
    kill "$pid" 2>/dev/null || continue
    # Give the server about a second to exit on SIGTERM ...
    for _ in 1 2 3 4 5 6 7 8 9 10; do
      kill -0 "$pid" 2>/dev/null || break
      sleep 0.1
    done
    # ... then force it, so a server ignoring SIGTERM cannot outlive the test.
    kill -9 "$pid" 2>/dev/null || true
    # Reap before deleting: no server may still be writing into $tmpdir.
    wait "$pid" 2>/dev/null || true
  done
  # ${tmpdir:?} refuses to run rm -rf on an empty path.
  rm -rf -- "${tmpdir:?}"
}
|
|
# Run cleanup on every exit path from here on.
trap cleanup EXIT

# Self-signed cert for the local TLS origin (httrack does not verify certs).
# openssl's chatter is kept in a log rather than discarded, so a failure
# explains itself instead of aborting silently under `set -e`.
if ! openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
  -out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
  >"$tmpdir/openssl.log" 2>&1; then
  echo "openssl could not generate the test certificate" >&2
  cat "$tmpdir/openssl.log" >&2
  exit 1
fi
# Key and certificate concatenated into the single PEM handed to the server.
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
|
|
|
|
# start_server <logdir> <mode>: launches a proxy+origin pair in the background
# and sets $origin_port and $proxy_port from its announced ephemeral ports.
#
# Globals:   server, tmpdir (read); pids (appended);
#            origin_port, proxy_port (written)
# Arguments: $1 - directory for the server's logs (created if missing)
#            $2 - mode string passed through to the server script
# Exits the script with status 1 if the server never prints a "ready" line or
# announces ports that are not plain numbers.
start_server() {
  local dir="$1" mode="$2" ports srv tries port
  mkdir -p "$dir"
  ports="$dir/ports.txt"
  python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
    >"$ports" 2>"$dir/server.err" &
  srv=$!
  pids="$pids $srv"
  # Poll for the "ready" line for up to ~10s (100 x 0.1s).
  for ((tries = 0; tries < 100; tries++)); do
    grep -q "^ready" "$ports" 2>/dev/null && break
    # Server already exited: stop polling now instead of burning the full
    # timeout. The check below decides, which also covers a server that
    # printed "ready" right before exiting.
    kill -0 "$srv" 2>/dev/null || break
    sleep 0.1
  done
  grep -q "^ready" "$ports" 2>/dev/null || {
    echo "server ($mode) did not start" >&2
    cat "$dir/server.err" >&2
    exit 1
  }
  origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
  proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
  # Refuse empty or non-numeric ports here rather than letting them leak into
  # the crawl URL / --proxy argument and fail obscurely later.
  for port in "$origin_port" "$proxy_port"; do
    case "$port" in
    '' | *[!0-9]*)
      echo "server ($mode) did not announce usable ports" >&2
      cat "$ports" >&2
      exit 1
      ;;
    esac
  done
}
|
|
|
|
# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
# whole job. A portable stand-in for `timeout`, which macOS lacks.
HANG_RC=137 # 128 + SIGKILL

# run_crawl <outdir> <proxy> <origin-port> [deadline-seconds]
#
# Arguments: $1 - httrack output directory (wiped first); log goes to "$1.log"
#            $2 - value passed to --proxy
#            $3 - port of the https origin on 127.0.0.1
#            $4 - optional deadline handed to sleep, default 60
# Returns:   httrack's exit status, or $HANG_RC when the deadline killed it.
run_crawl() {
  local out="$1" proxy="$2" port="$3" deadline="${4:-60}"
  rm -rf "$out"
  httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
    -O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
  local pid=$!
  # Watchdog. Two precautions keep it from lingering once the crawl is done:
  #  - it runs on /dev/null, so its sleep never holds this script's
  #    stdout/stderr open (an inherited pipe would stall whoever reads it);
  #  - on TERM it kills its own sleep instead of orphaning it.
  (
    nap=
    trap '[ -z "$nap" ] || kill "$nap" 2>/dev/null || true; exit 0' TERM
    sleep "$deadline" &
    nap=$!
    # Only a sleep that ran to completion means the deadline really expired.
    wait "$nap" && kill -9 "$pid" 2>/dev/null
  ) >/dev/null 2>&1 &
  local guard=$!
  local rc=0
  wait "$pid" 2>/dev/null || rc=$?
  # Crawl finished (or was killed): retire the watchdog and reap it.
  kill "$guard" 2>/dev/null || true
  wait "$guard" 2>/dev/null || true
  return "$rc"
}
|
|
|
|
# --- working proxy ----------------------------------------------------------
# Bring up a well-behaved proxy/origin pair; its logs live under $tmpdir/ok.
ok="$tmpdir/ok"
start_server "$ok" ok

# 1. The crawl must fetch the origin page, and the proxy log must show the
#    CONNECT for that origin -- proof the request was not sent directly.
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"

if ! grep -rq "ORIGIN-PAGE-85" "$ok/out"; then
  echo "FAIL: origin page not downloaded through proxy" >&2
  cat "$ok/out.log" >&2
  exit 1
fi

if ! grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log"; then
  echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
  cat "$ok/proxy.log" >&2
  exit 1
fi

echo "OK: https tunneled through proxy via CONNECT"
|
|
|
|
# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
# Start from empty logs so only this crawl's traffic is inspected.
: >"$ok/proxy.log"
: >"$ok/origin-headers.log"
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
if ! grep -rq "ORIGIN-PAGE-85" "$ok/out2"; then
  echo "FAIL: origin page not downloaded through authenticated proxy" >&2
  # Same diagnostics as the unauthenticated case: show the crawl log.
  cat "$ok/out2.log" >&2
  exit 1
fi
# First "AUTH Basic <token>" line logged by the proxy. awk stops at the first
# match itself, so no `| head` stage can trip `pipefail` with a SIGPIPE.
got=$(awk '/^AUTH Basic /{print $3; exit}' "$ok/proxy.log")
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
# which differs between GNU and BSD)
if [[ "$got" != "dXNlcjpzZWNyZXQ=" ]]; then
  echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
  cat "$ok/proxy.log" >&2
  exit 1
fi
# The origin's header log must not contain the proxy credentials header.
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
  echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
  cat "$ok/origin-headers.log" >&2
  exit 1
fi
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
|
|
|
|
# --- hostile proxy ----------------------------------------------------------
# A proxy that answers 200 then streams headers forever must not hang the crawl:
# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
# missing bound surfaces as $HANG_RC here.
flood="$tmpdir/flood"
start_server "$flood" flood
rc=0
run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
if [ "$rc" -eq "$HANG_RC" ]; then
  echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
  cat "$flood/out.log" >&2 || true
  exit 1
fi
# Any other 128+N status means httrack died on signal N (SIGSEGV, SIGABRT, ...).
# Crashing on hostile proxy input is a failure too, not a "bounded" response.
if [ "$rc" -gt 128 ] && [ "$rc" -le 192 ]; then
  echo "FAIL: httrack died on a signal (exit $rc) against a flooding proxy" >&2
  cat "$flood/out.log" >&2 || true
  exit 1
fi
# The flooding proxy never completes the tunnel, so the page must be absent.
if grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null; then
  echo "FAIL: flooding proxy unexpectedly served the page" >&2
  exit 1
fi
echo "OK: bounded proxy response, no hang on a flooding proxy"
|