Files
httrack/tests/proxy-https-server.py
Xavier Roche 1907621d37 htslib: tunnel https through the proxy via CONNECT (#85)
httrack opened https connections straight to the origin even when a proxy
was configured, so --proxy was silently ignored for https and the crawler
used the real IP. http_xfopen bypassed the proxy for any https:// URL,
because the absolute-URI proxy form it uses for http cannot carry https.

Connect to the proxy instead and, once the TCP connection is up, open an
HTTP CONNECT tunnel (http_proxy_tunnel) before the TLS handshake, so TLS
runs end-to-end with the origin. Proxy credentials now ride the CONNECT
request rather than the tunneled GET, where they would leak to the origin.
The exchange is a bounded blocking read inside the back_wait connect path:
no new async state, no struct/ABI change (the helpers stay visibility-hidden).

Verified end-to-end by 13_crawl_proxy_https.test: it crawls a local
self-signed https origin through a logging CONNECT proxy and asserts the
proxy saw the CONNECT and that credentials ride it. The assertion fails on
the pre-fix bypass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-19 08:43:56 +02:00

124 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
ports. Every request line the proxy receives (and any Proxy-Authorization) is
appended to the log file, so the test can assert that an https crawl really
tunneled through the proxy instead of bypassing it.
Usage: proxy-https-server.py <cert.pem> <logfile>
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
"""
import http.server
import socket
import socketserver
import ssl
import sys
import threading
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
class Origin(http.server.BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200)
self.send_header("Content-Type", "text/html")
self.send_header("Content-Length", str(len(ORIGIN_BODY)))
self.end_headers()
self.wfile.write(ORIGIN_BODY)
def log_message(self, *args):
pass
def start_origin(certfile):
httpd = socketserver.TCPServer(("127.0.0.1", 0), Origin)
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ctx.load_cert_chain(certfile)
httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)
port = httpd.socket.getsockname()[1]
threading.Thread(target=httpd.serve_forever, daemon=True).start()
return port
def pipe(src, dst):
try:
while True:
data = src.recv(65536)
if not data:
break
dst.sendall(data)
except OSError:
pass
finally:
for sock in (src, dst):
try:
sock.shutdown(socket.SHUT_RDWR)
except OSError:
pass
def handle_client(conn, logfile):
rfile = conn.makefile("rb")
request_line = rfile.readline().decode("latin-1").strip()
auth = None
while True:
line = rfile.readline().decode("latin-1")
if line in ("\r\n", "\n", ""):
break
key, _, value = line.partition(":")
if key.strip().lower() == "proxy-authorization":
auth = value.strip()
with open(logfile, "a") as handle:
handle.write(request_line + "\n")
if auth is not None:
handle.write("AUTH " + auth + "\n")
parts = request_line.split()
if len(parts) >= 2 and parts[0] == "CONNECT":
host, _, port = parts[1].partition(":")
try:
upstream = socket.create_connection((host, int(port or 443)))
except OSError:
conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
conn.close()
return
conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
threading.Thread(target=pipe, args=(conn, upstream), daemon=True).start()
pipe(upstream, conn)
else:
conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
conn.close()
def start_proxy(logfile):
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
srv.bind(("127.0.0.1", 0))
srv.listen(16)
port = srv.getsockname()[1]
def serve():
while True:
conn, _ = srv.accept()
threading.Thread(
target=handle_client, args=(conn, logfile), daemon=True
).start()
threading.Thread(target=serve, daemon=True).start()
return port
def main():
certfile, logfile = sys.argv[1], sys.argv[2]
open(logfile, "w").close()
origin_port = start_origin(certfile)
proxy_port = start_proxy(logfile)
print("ORIGIN %d" % origin_port, flush=True)
print("PROXY %d" % proxy_port, flush=True)
print("ready", flush=True)
threading.Event().wait()
if __name__ == "__main__":
main()