mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 09:09:02 +03:00
htslib: tunnel https through the proxy via CONNECT (#85)
httrack opened https connections straight to the origin even when a proxy was configured, so --proxy was silently ignored for https and the crawler used the real IP. http_xfopen bypassed the proxy for any https:// URL, because the absolute-URI proxy form it uses for http cannot carry https. Connect to the proxy instead and, once the TCP connection is up, open an HTTP CONNECT tunnel (http_proxy_tunnel) before the TLS handshake, so TLS runs end-to-end with the origin. Proxy credentials now ride the CONNECT request rather than the tunneled GET, where they would leak to the origin. The exchange is a bounded blocking read inside the back_wait connect path: no new async state, no struct/ABI change (the helpers stay visibility-hidden). Verified end-to-end by 13_crawl_proxy_https.test: it crawls a local self-signed https origin through a logging CONNECT proxy and asserts the proxy saw the CONNECT and that credentials ride it. The assertion fails on the pre-fix bypass. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>
This commit is contained in:
123
tests/proxy-https-server.py
Normal file
123
tests/proxy-https-server.py
Normal file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
|
||||
|
||||
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
|
||||
ports. Every request line the proxy receives (and any Proxy-Authorization) is
|
||||
appended to the log file, so the test can assert that an https crawl really
|
||||
tunneled through the proxy instead of bypassing it.
|
||||
|
||||
Usage: proxy-https-server.py <cert.pem> <logfile>
|
||||
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
|
||||
"""
|
||||
import http.server
|
||||
import socket
|
||||
import socketserver
|
||||
import ssl
|
||||
import sys
|
||||
import threading
|
||||
|
||||
# Fixed HTML page that the Origin handler returns for every GET request.
# NOTE(review): the ORIGIN-PAGE-85 marker is presumably what the test looks
# for in the mirrored copy -- confirm against the test script.
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
|
||||
|
||||
|
||||
class Origin(http.server.BaseHTTPRequestHandler):
    """Origin request handler: answers every GET with the fixed ORIGIN_BODY page."""

    def do_GET(self):
        """Send a 200 text/html response carrying ORIGIN_BODY, whatever the path."""
        payload = ORIGIN_BODY
        self.send_response(200)
        for name, value in (
            ("Content-Type", "text/html"),
            ("Content-Length", str(len(payload))),
        ):
            self.send_header(name, value)
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, *args):
        """Drop the per-request log line instead of emitting it."""
        return None
|
||||
|
||||
|
||||
def start_origin(certfile):
    """Start the TLS origin on an ephemeral loopback port and return that port.

    certfile is passed to load_cert_chain() on its own, so it must hold both
    the certificate and its private key. The server runs in a daemon thread
    and is never shut down explicitly.
    """
    server = socketserver.TCPServer(("127.0.0.1", 0), Origin)
    tls = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    tls.load_cert_chain(certfile)
    # Wrap the listening socket itself, so every accepted connection is TLS.
    server.socket = tls.wrap_socket(server.socket, server_side=True)
    _, bound_port = server.socket.getsockname()
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    return bound_port
|
||||
|
||||
|
||||
def pipe(src, dst):
    """Copy bytes from src to dst until EOF or a socket error, then shut both down.

    Socket errors are swallowed: for this relay a reset or an already-closed
    peer just means the transfer is over.
    """
    try:
        chunk = src.recv(65536)
        while chunk:
            dst.sendall(chunk)
            chunk = src.recv(65536)
    except OSError:
        pass
    finally:
        # Both sockets are shut down in both directions, not only the side
        # that ended; a socket that is already down is ignored.
        for endpoint in (src, dst):
            try:
                endpoint.shutdown(socket.SHUT_RDWR)
            except OSError:
                pass
|
||||
|
||||
|
||||
def _parse_connect_target(authority):
    """Split a CONNECT authority into (host, port); return None if malformed.

    The port defaults to 443 when absent. A bracketed IPv6 literal such as
    "[::1]:8443" is accepted and returned without its brackets.
    """
    if authority.startswith("["):
        host, closing, rest = authority[1:].partition("]")
        if not closing or (rest and not rest.startswith(":")):
            return None
        port_text = rest[1:]
    else:
        host, _, port_text = authority.partition(":")
    if not port_text:
        return host, 443
    try:
        port = int(port_text)
    except ValueError:
        return None
    if not 0 <= port <= 65535:
        return None
    return host, port


def handle_client(conn, logfile):
    """Serve one proxy connection: log the request head, then tunnel CONNECT.

    The request line, followed by an "AUTH <value>" line when a
    Proxy-Authorization header was sent, is appended to logfile before any
    reply is written. A CONNECT request gets a blind two-way tunnel to the
    requested host:port (400 if the target is malformed, 502 if it cannot be
    reached); every other method is refused with 501.

    Args:
        conn: connected client socket; always closed before returning.
        logfile: path of the text file the request head is appended to.
    """
    upstream = None
    try:
        # Unbuffered on purpose: a buffered reader could swallow bytes that
        # arrive right after the header block, and pipe() reads from conn
        # directly once the tunnel is up.
        with conn.makefile("rb", buffering=0) as rfile:
            request_line = rfile.readline().decode("latin-1").strip()
            auth = None
            while True:
                line = rfile.readline().decode("latin-1")
                if line in ("\r\n", "\n", ""):
                    break
                key, _, value = line.partition(":")
                if key.strip().lower() == "proxy-authorization":
                    auth = value.strip()

        # latin-1 mirrors the decoding above, so writing the log can never
        # fail on a byte the locale encoding cannot represent.
        with open(logfile, "a", encoding="latin-1") as handle:
            handle.write(request_line + "\n")
            if auth is not None:
                handle.write("AUTH " + auth + "\n")

        parts = request_line.split()
        if len(parts) < 2 or parts[0] != "CONNECT":
            conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
            return

        target = _parse_connect_target(parts[1])
        if target is None:
            conn.sendall(b"HTTP/1.0 400 Bad Request\r\n\r\n")
            return

        try:
            upstream = socket.create_connection(target)
        except OSError:
            conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
            return

        conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
        relay = threading.Thread(target=pipe, args=(conn, upstream), daemon=True)
        relay.start()
        pipe(upstream, conn)
        # Wait for the opposite direction to finish before the sockets are
        # closed underneath it.
        relay.join()
    finally:
        if upstream is not None:
            upstream.close()
        conn.close()
|
||||
|
||||
|
||||
def start_proxy(logfile):
    """Start the logging proxy on an ephemeral loopback port and return that port.

    A daemon thread accepts connections forever and hands each one, together
    with logfile, to handle_client in its own daemon thread.
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind(("127.0.0.1", 0))
    listener.listen(16)
    _, bound_port = listener.getsockname()

    def accept_forever():
        while True:
            client, _peer = listener.accept()
            worker = threading.Thread(
                target=handle_client, args=(client, logfile), daemon=True
            )
            worker.start()

    acceptor = threading.Thread(target=accept_forever, daemon=True)
    acceptor.start()
    return bound_port
|
||||
|
||||
|
||||
def main():
    """Start both servers, announce their ports on stdout, then block forever.

    Reads <cert.pem> and <logfile> from the command line; any further
    arguments are ignored. The log file is truncated before either server
    starts. Exits with status 2 and a usage line on stderr when an argument
    is missing.
    """
    if len(sys.argv) < 3:
        sys.stderr.write("usage: proxy-https-server.py <cert.pem> <logfile>\n")
        sys.exit(2)
    certfile, logfile = sys.argv[1], sys.argv[2]

    # Start from an empty log so entries from an earlier run cannot leak in.
    with open(logfile, "w"):
        pass

    origin_port = start_origin(certfile)
    proxy_port = start_proxy(logfile)
    # Each line is flushed so a parent reading the pipe sees it immediately.
    print("ORIGIN %d" % origin_port, flush=True)
    print("PROXY %d" % proxy_port, flush=True)
    print("ready", flush=True)
    # The servers live in daemon threads; park the main thread until killed.
    threading.Event().wait()
|
||||
|
||||
|
||||
# Script entry point: start the servers only when run directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user