#!/bin/bash
#
# Issue #85: an https crawl must go through the configured proxy (CONNECT
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
#
# Environment:
#   top_srcdir     root used to locate tests/proxy-https-server.py (default "..")
#   HTTPS_SUPPORT  when set to "no" the test is skipped

set -euo pipefail

: "${top_srcdir:=..}"

# Exit status 77 reports the test as skipped rather than failed.
if [[ "${HTTPS_SUPPORT:-}" == "no" ]]; then
  echo "no https support compiled, skipping"
  exit 77
fi
if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
  echo "python3/openssl missing, skipping"
  exit 77
fi

server="$top_srcdir/tests/proxy-https-server.py"
tmpdir=$(mktemp -d)
# Space-separated PIDs of the helper servers started by start_server. Kept as a
# plain string (not an array) so an empty list is safe under `set -u` on old bash.
pids=

# cleanup: EXIT handler; stops every helper server and removes the scratch dir.
cleanup() {
  local pid
  # $pids is intentionally unquoted: it is a whitespace-separated PID list.
  for pid in $pids; do
    kill "$pid" 2>/dev/null || true
  done
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT

# self-signed cert for the local TLS origin (httrack does not verify certs)
# openssl's chatter is kept in a log and only shown when generation fails, so a
# broken openssl no longer aborts the test without any diagnostic.
if ! openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
  -out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
  >"$tmpdir/openssl.log" 2>&1; then
  echo "openssl could not create the self-signed certificate" >&2
  cat "$tmpdir/openssl.log" >&2
  exit 1
fi
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"

# start_server <dir> <mode>: launches a proxy+origin pair, sets $origin_port
# and $proxy_port from its announced ephemeral ports.
#   <dir>   working directory for the pair; receives ports.txt and server.err
#   <mode>  passed through to the server script as its third argument
# Exits the script with status 1 if the server never prints a "ready" line.
start_server() {
  local dir="$1" mode="$2" ports server_pid
  mkdir -p "$dir"
  ports="$dir/ports.txt"
  python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
    >"$ports" 2>"$dir/server.err" &
  server_pid=$!
  pids="$pids $server_pid"
  # Poll for the readiness marker for up to ~10s (100 x 0.1s).
  for _ in {1..100}; do
    grep -q "^ready" "$ports" 2>/dev/null && break
    # The server is already gone: stop waiting and report its stderr right away
    # instead of sleeping through the rest of the budget.
    kill -0 "$server_pid" 2>/dev/null || break
    sleep 0.1
  done
  grep -q "^ready" "$ports" 2>/dev/null || {
    echo "server ($mode) did not start" >&2
    cat "$dir/server.err" >&2
    exit 1
  }
  origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
  proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
}

# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
# whole job. A portable stand-in for `timeout`, which macOS lacks.
HANG_RC=137 # 128 + SIGKILL
# run_crawl <out> <proxy> <port>: crawls https://127.0.0.1:<port>/ through
# <proxy> into directory <out>, logging to <out>.log. Returns httrack's exit
# status, or $HANG_RC when the 60s guard had to SIGKILL it.
run_crawl() {
  local out="$1" proxy="$2" port="$3"
  rm -rf "$out"
  httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
    -O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
  local pid=$!
  # Watchdog. The sleep runs as a child of the guard subshell, so the guard
  # traps TERM and takes its sleep down with it; otherwise the sleep would be
  # orphaned for up to 60s, still holding this script's stdout/stderr open.
  (
    sleeper=
    trap 'if [[ -n "$sleeper" ]]; then kill "$sleeper" 2>/dev/null || true; fi; exit 0' TERM
    sleep 60 &
    sleeper=$!
    # `wait` returns early (and the trap fires) when the guard is cancelled.
    wait "$sleeper" && kill -9 "$pid" 2>/dev/null
  ) &
  local guard=$!
  local rc=0
  wait "$pid" 2>/dev/null || rc=$?
  # Crawl finished (or was killed): cancel the watchdog and reap it.
  kill "$guard" 2>/dev/null || true
  wait "$guard" 2>/dev/null || true
  return "$rc"
}

# --- working proxy ----------------------------------------------------------
ok="$tmpdir/ok"
start_server "$ok" ok

# 1. page retrieved AND the proxy saw a CONNECT to the origin
# A non-zero crawl status still fails the test with that same status, but now
# with the crawl log shown instead of a silent `set -e` abort.
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port" || {
  crawl_rc=$?
  echo "FAIL: httrack exited with status $crawl_rc" >&2
  cat "$ok/out.log" >&2
  exit "$crawl_rc"
}
grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
  echo "FAIL: origin page not downloaded through proxy" >&2
  cat "$ok/out.log" >&2
  exit 1
}
grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
  echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
  cat "$ok/proxy.log" >&2
  exit 1
}
echo "OK: https tunneled through proxy via CONNECT"

# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
# Truncate both logs so only this crawl's traffic is inspected.
: >"$ok/proxy.log"
: >"$ok/origin-headers.log"
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port" || {
  crawl_rc=$?
  echo "FAIL: httrack exited with status $crawl_rc" >&2
  cat "$ok/out2.log" >&2
  exit "$crawl_rc"
}
grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
  echo "FAIL: origin page not downloaded through authenticated proxy" >&2
  cat "$ok/out2.log" >&2
  exit 1
}
got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
# which differs between GNU and BSD)
test "$got" == "dXNlcjpzZWNyZXQ=" || {
  echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
  cat "$ok/proxy.log" >&2
  exit 1
}
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
  echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
  cat "$ok/origin-headers.log" >&2
  exit 1
fi
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"

# --- hostile proxy ----------------------------------------------------------
# A proxy that answers 200
# then streams headers forever must not hang the crawl:
# the client is expected to bound the response. A hung httrack is killed by
# run_crawl after 60s, so a missing bound shows up here as status $HANG_RC.
flood="$tmpdir/flood"
start_server "$flood" flood

# Capture the crawl status instead of letting `set -e` abort on it: any status
# other than the watchdog's kill code is acceptable for this check.
flood_rc=0
if ! run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port"; then
  flood_rc=$?
fi

if [ "$flood_rc" -eq "$HANG_RC" ]; then
  echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
  exit 1
fi

# The output directory may not exist at all, hence the silenced grep errors.
if grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null; then
  echo "FAIL: flooding proxy unexpectedly served the page" >&2
  exit 1
fi

echo "OK: bounded proxy response, no hang on a flooding proxy"