mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 00:58:47 +03:00
Compare commits
12 Commits
fix/cookie
...
fix/html-u
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
799c045061 | ||
|
|
fb1ee3bf2e | ||
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 | ||
|
|
0145ec37a3 | ||
|
|
a80fab38ba | ||
|
|
c52a524a63 | ||
|
|
1907621d37 | ||
|
|
3b2d7afdaa | ||
|
|
6ee539619e | ||
|
|
fb098b27b4 |
@@ -2532,8 +2532,26 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
#if HTS_USEOPENSSL
|
||||
/* SSL mode */
|
||||
if (back[i].r.ssl) {
|
||||
int tunnel_ok = 1;
|
||||
|
||||
// https via proxy: CONNECT-tunnel before TLS (#85)
|
||||
if (back[i].r.req.proxy.active && back[i].r.ssl_con == NULL) {
|
||||
const int timeout = back[i].timeout > 0 ? back[i].timeout : 30;
|
||||
|
||||
tunnel_ok =
|
||||
http_proxy_tunnel(opt, &back[i].r, back[i].url_adr, timeout);
|
||||
if (!tunnel_ok) {
|
||||
if (!strnotempty(back[i].r.msg))
|
||||
strcpybuff(back[i].r.msg, "proxy CONNECT failed");
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
back[i].r.statuscode = STATUSCODE_NON_FATAL;
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
}
|
||||
}
|
||||
// handshake not yet launched
|
||||
if (!back[i].r.ssl_con) {
|
||||
if (tunnel_ok && !back[i].r.ssl_con) {
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
||||
// new session
|
||||
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
||||
@@ -2551,7 +2569,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
||||
}
|
||||
/* Error */
|
||||
if (back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||
if (tunnel_ok && back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
|
||||
@@ -953,9 +953,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
p = buff;
|
||||
do {
|
||||
int insert_after_argc;
|
||||
int quoted; /* "" unquotes to empty but is still a real token (#106) */
|
||||
|
||||
// read next
|
||||
lastp = p;
|
||||
quoted = (p != NULL && *p == '"');
|
||||
if (p) {
|
||||
p = next_token(p, 1);
|
||||
if (p) {
|
||||
@@ -966,7 +968,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
|
||||
/* Insert parameters BUT so that they can be in the same order */
|
||||
if (lastp) {
|
||||
if (strnotempty(lastp)) {
|
||||
if (strnotempty(lastp) || quoted) {
|
||||
insert_after_argc = argc - insert_after;
|
||||
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
||||
x_argvblk_size, x_ptr);
|
||||
|
||||
193
src/htslib.c
193
src/htslib.c
@@ -644,6 +644,165 @@ T_SOC http_fopen(httrackp * opt, const char *adr, const char *fil, htsblk * reto
|
||||
return http_xfopen(opt, 0, 1, 1, NULL, adr, fil, retour);
|
||||
}
|
||||
|
||||
// Read a CRLF line from a non-blocking socket (waits up to timeout per recv).
|
||||
// Returns the line length (0 = empty), or -1 on timeout/EOF/error.
|
||||
static int proxy_getline(T_SOC soc, char *s, int max, int timeout) {
|
||||
int j = 0;
|
||||
|
||||
for (;;) {
|
||||
unsigned char ch;
|
||||
int n;
|
||||
|
||||
if (!check_readinput_t(soc, timeout))
|
||||
return -1; // timed out waiting for data
|
||||
n = (int) recv(soc, &ch, 1, 0);
|
||||
if (n == 1) {
|
||||
if (ch == 13) // CR
|
||||
continue;
|
||||
if (ch == 10) // LF: end of line
|
||||
break;
|
||||
if (j >= max - 1)
|
||||
return -1; // line too long: bound the read against a hostile proxy
|
||||
s[j++] = (char) ch;
|
||||
} else if (n == 0) {
|
||||
return -1; // connection closed
|
||||
} else {
|
||||
#ifdef _WIN32
|
||||
if (WSAGetLastError() == WSAEWOULDBLOCK)
|
||||
continue;
|
||||
#else
|
||||
if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
|
||||
continue;
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
s[j] = '\0';
|
||||
return j;
|
||||
}
|
||||
|
||||
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||
int timeout) {
|
||||
const T_SOC soc = retour->soc;
|
||||
const char *const host = jump_identification_const(adr); // host[:port]
|
||||
const char *const portsep = jump_toport_const(adr); // ":port" or NULL
|
||||
char BIGSTK authority[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK req[HTS_URLMAXSIZE * 4 + 1100];
|
||||
char line[1024];
|
||||
int code;
|
||||
|
||||
if (soc == INVALID_SOCKET)
|
||||
return 0;
|
||||
|
||||
// CONNECT needs an explicit host:port; default the https port
|
||||
authority[0] = '\0';
|
||||
if (portsep != NULL)
|
||||
strlcatbuff(authority, host, sizeof(authority)); // already host:port
|
||||
else
|
||||
snprintf(authority, sizeof(authority), "%s:%d", host, 443);
|
||||
|
||||
// backstop: never let a stray CR/LF in the host smuggle a second line into
|
||||
// the CONNECT request (the host is already sanitized upstream)
|
||||
{
|
||||
const char *c;
|
||||
|
||||
for (c = authority; *c != '\0'; c++) {
|
||||
if ((unsigned char) *c < ' ') {
|
||||
strcpybuff(retour->msg, "proxy CONNECT: invalid host");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
snprintf(req, sizeof(req), "CONNECT %s HTTP/1.0" H_CRLF "Host: %s" H_CRLF,
|
||||
authority, authority);
|
||||
|
||||
// creds go on the CONNECT, not the tunneled origin request
|
||||
if (link_has_authorization(retour->req.proxy.name)) {
|
||||
const char *a = jump_identification_const(retour->req.proxy.name);
|
||||
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
||||
char autorisation[1100];
|
||||
char user_pass[256];
|
||||
|
||||
autorisation[0] = user_pass[0] = '\0';
|
||||
strncatbuff(user_pass, astart, (int) (a - astart) - 1);
|
||||
strcpybuff(user_pass, unescape_http(OPT_GET_BUFF(opt),
|
||||
OPT_GET_BUFF_SIZE(opt), user_pass));
|
||||
code64((unsigned char *) user_pass, (int) strlen(user_pass),
|
||||
(unsigned char *) autorisation, 0);
|
||||
strlcatbuff(req, "Proxy-Authorization: Basic ", sizeof(req));
|
||||
strlcatbuff(req, autorisation, sizeof(req));
|
||||
strlcatbuff(req, H_CRLF, sizeof(req));
|
||||
}
|
||||
strlcatbuff(req, H_CRLF, sizeof(req)); // end of request headers
|
||||
|
||||
// raw send: ssl is set, so sendc() would route to TLS
|
||||
{
|
||||
const char *p = req;
|
||||
size_t remain = strlen(req);
|
||||
int stalls = 0;
|
||||
|
||||
while (remain > 0) {
|
||||
const int n = (int) send(soc, p, (int) remain, 0);
|
||||
|
||||
if (n > 0) {
|
||||
p += n;
|
||||
remain -= (size_t) n;
|
||||
stalls = 0;
|
||||
} else {
|
||||
#ifdef _WIN32
|
||||
const int wouldblock = (WSAGetLastError() == WSAEWOULDBLOCK);
|
||||
#else
|
||||
const int wouldblock =
|
||||
(errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
|
||||
#endif
|
||||
// don't spin forever on a fatal error or an unwritable socket
|
||||
if (!wouldblock || !check_writeinput_t(soc, timeout) ||
|
||||
++stalls > 100) {
|
||||
strcpybuff(retour->msg, "proxy CONNECT: write error");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// proxy status line: "HTTP/1.x <code> ..."
|
||||
if (proxy_getline(soc, line, sizeof(line), timeout) < 0) {
|
||||
strcpybuff(retour->msg, "proxy CONNECT: no response");
|
||||
return 0;
|
||||
}
|
||||
if (sscanf(line, "HTTP/%*d.%*d %d", &code) < 1)
|
||||
code = 0;
|
||||
if (code < 200 || code >= 300) {
|
||||
snprintf(retour->msg, sizeof(retour->msg), "proxy CONNECT refused: %s",
|
||||
strnotempty(line) ? line : "(no status)");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// drain headers to the blank line; cap the count so a flooding proxy can't
|
||||
// stall the crawl
|
||||
{
|
||||
int headers = 0;
|
||||
|
||||
for (;;) {
|
||||
const int n = proxy_getline(soc, line, sizeof(line), timeout);
|
||||
|
||||
if (n < 0) {
|
||||
strcpybuff(retour->msg, "proxy CONNECT: truncated response");
|
||||
return 0;
|
||||
}
|
||||
if (n == 0)
|
||||
break; // blank line: tunnel ready
|
||||
if (++headers > 64) {
|
||||
strcpybuff(retour->msg, "proxy CONNECT: too many response headers");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// ouverture d'une liaison http, envoi d'une requète
|
||||
// mode: 0 GET 1 HEAD [2 POST]
|
||||
// treat: traiter header?
|
||||
@@ -680,14 +839,14 @@ T_SOC http_xfopen(httrackp * opt, int mode, int treat, int waitconnect,
|
||||
|
||||
/* connexion */
|
||||
if (retour) {
|
||||
if ((!(retour->req.proxy.active))
|
||||
|| ((strcmp(adr, "file://") == 0)
|
||||
|| (strncmp(adr, "https://", 8) == 0)
|
||||
)
|
||||
) { /* pas de proxy, ou non utilisable ici */
|
||||
/* no proxy, or proxy not usable here (local file) */
|
||||
if ((!(retour->req.proxy.active)) || (strcmp(adr, "file://") == 0)) {
|
||||
soc = newhttp(opt, adr, retour, -1, waitconnect);
|
||||
} else {
|
||||
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port, waitconnect); // ouvrir sur le proxy à la place
|
||||
// to the proxy; https tunnels to the origin via CONNECT in back_wait
|
||||
// (#85)
|
||||
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port,
|
||||
waitconnect);
|
||||
}
|
||||
} else {
|
||||
soc = newhttp(opt, adr, NULL, -1, waitconnect);
|
||||
@@ -1043,8 +1202,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
if (xsend)
|
||||
print_buffer(&bstr, "%s", xsend); // éventuelles autres lignes
|
||||
|
||||
// tester proxy authentication
|
||||
if (retour->req.proxy.active) {
|
||||
// for https, auth rides the CONNECT (the tunneled GET would leak it)
|
||||
if (retour->req.proxy.active && strncmp(adr, "https://", 8) != 0) {
|
||||
if (link_has_authorization(retour->req.proxy.name)) { // et hop, authentification proxy!
|
||||
const char *a = jump_identification_const(retour->req.proxy.name);
|
||||
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
||||
@@ -1827,6 +1986,24 @@ int check_readinput_t(T_SOC soc, int timeout) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// wait until the socket is writable, up to timeout seconds
|
||||
int check_writeinput_t(T_SOC soc, int timeout) {
|
||||
if (soc != INVALID_SOCKET) {
|
||||
fd_set fds;
|
||||
struct timeval tv;
|
||||
const int isoc = (int) soc;
|
||||
|
||||
assertf(isoc == soc);
|
||||
FD_ZERO(&fds);
|
||||
FD_SET(isoc, &fds);
|
||||
tv.tv_sec = timeout;
|
||||
tv.tv_usec = 0;
|
||||
select(isoc + 1, NULL, &fds, NULL, &tv);
|
||||
return FD_ISSET(isoc, &fds) ? 1 : 0;
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// idem, sauf qu'ici on peut choisir la taille max de données à recevoir
|
||||
// SI bufl==0 alors le buffer est censé être de 8kos, et on recoit par bloc de lignes
|
||||
// en éliminant les cr (ex: header), arrêt si double-lf
|
||||
|
||||
11
src/htslib.h
11
src/htslib.h
@@ -198,6 +198,17 @@ HTS_INLINE void deletesoc_r(htsblk * r);
|
||||
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
||||
int check_readinput(htsblk * r);
|
||||
int check_readinput_t(T_SOC soc, int timeout);
|
||||
int check_writeinput_t(T_SOC soc, int timeout);
|
||||
|
||||
/* Open an HTTP CONNECT tunnel through the active proxy for an https request:
|
||||
`retour->soc` must already be TCP-connected to the proxy, and `adr` is the
|
||||
origin authority (url_adr, e.g. "https://host:port"). Sends the CONNECT
|
||||
request (with Proxy-Authorization when the proxy carries credentials) and
|
||||
reads the proxy's status line, so the caller's TLS handshake then runs
|
||||
end-to-end with the origin. Blocks up to `timeout` seconds. Returns 1 on a
|
||||
2xx tunnel, 0 on failure (retour->msg/statuscode set). */
|
||||
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||
int timeout);
|
||||
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
||||
char *rcvd);
|
||||
void treatfirstline(htsblk * retour, const char *rcvd);
|
||||
|
||||
@@ -296,6 +296,12 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
||||
return dst;
|
||||
}
|
||||
|
||||
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||
would underflow; space reads as the word boundary the guards want there. */
|
||||
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Main parser */
|
||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -556,7 +562,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
p = strfield(html, "title");
|
||||
if (p) {
|
||||
if (*(html - 1) == '/')
|
||||
if (html_prevc(html, r->adr) == '/')
|
||||
p = 0; // /title
|
||||
} else {
|
||||
if (strfield(html, "/html"))
|
||||
@@ -1341,6 +1347,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int can_avoid_quotes = 0;
|
||||
char quotes_replacement = '\0';
|
||||
int ensure_not_mime = 0;
|
||||
// @import: the quoted token is the URL; a trailing
|
||||
// media/supports/layer condition is not part of it
|
||||
int is_import = 0;
|
||||
|
||||
if (inscript_tag)
|
||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||
@@ -1357,9 +1366,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (!nc)
|
||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||
if (!nc) { // location="doc"
|
||||
if ((nc = strfield(html, "location"))
|
||||
&& !isspace(*(html - 1))
|
||||
)
|
||||
if ((nc = strfield(html, "location")) &&
|
||||
!isspace(html_prevc(html, r->adr)))
|
||||
nc = 0;
|
||||
}
|
||||
if (!nc)
|
||||
@@ -1380,7 +1388,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
}
|
||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
||||
if (!nc && (nc = strfield(html, "url")) &&
|
||||
(!isalnum(html_prevc(html, r->adr))) &&
|
||||
html_prevc(html, r->adr) != '_') { // url(url)
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
can_avoid_quotes = 1;
|
||||
@@ -1390,6 +1400,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((nc = strfield(html, "import"))) { // import "url"
|
||||
if (is_space(*(html + nc))) {
|
||||
expected = 0; // no char expected
|
||||
is_import = 1;
|
||||
} else
|
||||
nc = 0;
|
||||
}
|
||||
@@ -1407,6 +1418,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||
const char *b, *c;
|
||||
int ndelim = 1;
|
||||
int valid_url = 0;
|
||||
|
||||
if ((*a == 34) || (*a == '\''))
|
||||
a++;
|
||||
@@ -1421,12 +1433,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
b++;
|
||||
}
|
||||
c = b--;
|
||||
c += ndelim;
|
||||
while(*c == ' ')
|
||||
c++;
|
||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
||||
|| (*c == '\r')) {
|
||||
c -= (ndelim + 1);
|
||||
// no closing delimiter here (truncated input):
|
||||
// Don't scan past the buffer NUL or capture it.
|
||||
if (*c != '\0') {
|
||||
c += ndelim;
|
||||
while (*c == ' ')
|
||||
c++;
|
||||
valid_url =
|
||||
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||
(*c == '\r') ||
|
||||
(is_import && *(b + 1 + ndelim) == ' ');
|
||||
}
|
||||
if (valid_url) {
|
||||
// URL end = last char (b), not the delimiter
|
||||
c = b;
|
||||
if ((int) (c - a + 1)) {
|
||||
if (ensure_not_mime) {
|
||||
int i = 0;
|
||||
@@ -1485,7 +1505,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1692,6 +1711,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
hts_nodetect[i -
|
||||
1]);
|
||||
}
|
||||
// xmlns / xmlns:prefix declare
|
||||
// XML namespaces, not resources
|
||||
// (#191)
|
||||
else {
|
||||
const int xl = strfield(
|
||||
intag_startattr, "xmlns");
|
||||
const char xc =
|
||||
intag_startattr[xl];
|
||||
if (xl &&
|
||||
(xc == ':' || xc == '=' ||
|
||||
is_space(xc))) {
|
||||
url_ok = 0;
|
||||
hts_log_print(
|
||||
opt, LOG_DEBUG,
|
||||
"dirty parsing: xmlns "
|
||||
"namespace avoided");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -89,4 +89,37 @@ grep -q NEWCONTENT "$(find "$out" -path '*/a.html' -print -quit)" || {
|
||||
exit 1
|
||||
}
|
||||
|
||||
# --- 3. an empty quoted arg survives the doit.log round-trip (#106) ----------
|
||||
# -%F "" (empty footer) records an empty "" token in doit.log; -r2 follows it so
|
||||
# a "drop the empty token" bug shifts -r2 into -%F's slot (the reprise then sees
|
||||
# -%F -r2 and panics "%F needs to be followed by ..."), making the bug visible
|
||||
# rather than a harmless run off the end of argv.
|
||||
out2="$tmp/out2"
|
||||
rc=0
|
||||
"$bin" "$url" -O "$out2" --quiet -n -%v0 -%F "" -r2 >/dev/null 2>&1 || rc=$?
|
||||
test "$rc" -eq 0 || {
|
||||
echo "FAIL: initial mirror with empty footer exited $rc"
|
||||
exit 1
|
||||
}
|
||||
# precondition: the writer put the empty token on disk for the reader to reload.
|
||||
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||
echo "FAIL: empty footer not recorded as -%F \"\" -r2 in doit.log"
|
||||
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||
exit 1
|
||||
}
|
||||
# no-url reprise: the reader rebuilds argv from doit.log and rewrites doit.log
|
||||
# from it. The empty token surviving in the regenerated file proves the reader
|
||||
# kept it (a drop/swallow would panic above or rewrite -%F without the "").
|
||||
rc=0
|
||||
"$bin" -O "$out2" --quiet >/dev/null 2>&1 || rc=$?
|
||||
test "$rc" -eq 0 || {
|
||||
echo "FAIL: empty-footer reprise exited $rc (empty token dropped from doit.log?)"
|
||||
exit 1
|
||||
}
|
||||
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||
echo "FAIL: empty footer did not survive the doit.log reload round-trip"
|
||||
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||
exit 1
|
||||
}
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -154,4 +154,81 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
|
||||
grep -q 'title="file://' "$saved2" ||
|
||||
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
||||
|
||||
# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
|
||||
# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
|
||||
site3="$tmp/xmlns"
|
||||
mkdir -p "$site3"
|
||||
for f in ns og rdfs real; do gif "$site3/$f.gif"; done
|
||||
cat >"$site3/index.html" <<EOF
|
||||
<html xmlns="file://$site3/ns.gif"><body>
|
||||
<svg xmlns:og="file://$site3/og.gif"></svg>
|
||||
<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
|
||||
<a href="file://$site3/real.gif">real link</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out3="$tmp/xmlns-out"
|
||||
crawl "$site3/index.html" "$out3"
|
||||
|
||||
# the real link is still captured
|
||||
found "real.gif" "$out3"
|
||||
# namespace-declaration targets must not be fetched (default + prefixed forms)
|
||||
notfound "ns.gif" "$out3"
|
||||
notfound "og.gif" "$out3"
|
||||
notfound "rdfs.gif" "$out3"
|
||||
|
||||
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||
# a space before ';'); they are the negative controls: without the parser fix the
|
||||
# URL is dropped, so a regression fails these found() checks.
|
||||
site4="$tmp/cssimport"
|
||||
mkdir -p "$site4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||
cat >"$site4/main.css" <<'EOF'
|
||||
@import url(nq.css);
|
||||
@import url("dqu.css");
|
||||
@import url('squ.css');
|
||||
@import "dqs.css";
|
||||
@import 'sqs.css';
|
||||
@import url(med.css) screen and (min-width: 400px);
|
||||
@import "cond.css" screen;
|
||||
@import "sup.css" supports(display: flex);
|
||||
@import url(lay.css) layer(base);
|
||||
@import "spc.css" ;
|
||||
EOF
|
||||
out4="$tmp/cssimport-out"
|
||||
crawl "$site4/main.css" "$out4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||
|
||||
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||
grep -Fq "$cond" "$m4" ||
|
||||
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||
done
|
||||
|
||||
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||
# capture a bogus link; a valid sibling import is still captured. Guards a heap
|
||||
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||
site5="$tmp/cssimport-trunc"
|
||||
mkdir -p "$site5"
|
||||
printf 'body{}\n' >"$site5/good.css"
|
||||
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||
out5="$tmp/cssimport-trunc-out"
|
||||
crawl "$site5/main.css" "$out5"
|
||||
found "good.css" "$out5"
|
||||
notfound "trunc" "$out5"
|
||||
|
||||
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
|
||||
# url() target is still captured; here it just must not underflow.
|
||||
site6="$tmp/parse-off0"
|
||||
mkdir -p "$site6"
|
||||
printf 'body{}\n' >"$site6/off0.css"
|
||||
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||
out6="$tmp/parse-off0-out"
|
||||
crawl "$site6/main.css" "$out6"
|
||||
found "off0.css" "$out6"
|
||||
|
||||
exit 0
|
||||
|
||||
136
tests/13_crawl_proxy_https.test
Normal file
136
tests/13_crawl_proxy_https.test
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Issue #85: an https crawl must go through the configured proxy (CONNECT
|
||||
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
|
||||
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
||||
echo "no https support compiled, skipping"
|
||||
exit 77
|
||||
fi
|
||||
if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
|
||||
echo "python3/openssl missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
|
||||
server="$top_srcdir/tests/proxy-https-server.py"
|
||||
tmpdir=$(mktemp -d)
|
||||
pids=
|
||||
|
||||
cleanup() {
|
||||
for pid in $pids; do
|
||||
kill "$pid" 2>/dev/null || true
|
||||
done
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# self-signed cert for the local TLS origin (httrack does not verify certs)
|
||||
openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
|
||||
-out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
|
||||
>/dev/null 2>&1
|
||||
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
|
||||
|
||||
# start_server <logdir> <mode>: launches a proxy+origin pair, sets $origin_port
|
||||
# and $proxy_port from its announced ephemeral ports.
|
||||
start_server() {
|
||||
local dir="$1" mode="$2" ports
|
||||
mkdir -p "$dir"
|
||||
ports="$dir/ports.txt"
|
||||
python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
|
||||
>"$ports" 2>"$dir/server.err" &
|
||||
pids="$pids $!"
|
||||
for _ in $(seq 1 100); do
|
||||
grep -q "^ready" "$ports" 2>/dev/null && break
|
||||
sleep 0.1
|
||||
done
|
||||
grep -q "^ready" "$ports" 2>/dev/null || {
|
||||
echo "server ($mode) did not start" >&2
|
||||
cat "$dir/server.err" >&2
|
||||
exit 1
|
||||
}
|
||||
origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
|
||||
proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
|
||||
}
|
||||
|
||||
# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
|
||||
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
|
||||
# whole job. A portable stand-in for `timeout`, which macOS lacks.
|
||||
HANG_RC=137 # 128 + SIGKILL
|
||||
run_crawl() {
|
||||
local out="$1" proxy="$2" port="$3"
|
||||
rm -rf "$out"
|
||||
httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
|
||||
-O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
|
||||
local pid=$!
|
||||
(sleep 60 && kill -9 "$pid" 2>/dev/null) &
|
||||
local guard=$!
|
||||
local rc=0
|
||||
wait "$pid" 2>/dev/null || rc=$?
|
||||
kill "$guard" 2>/dev/null || true
|
||||
wait "$guard" 2>/dev/null || true
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
# --- working proxy ----------------------------------------------------------
|
||||
ok="$tmpdir/ok"
|
||||
start_server "$ok" ok
|
||||
|
||||
# 1. page retrieved AND the proxy saw a CONNECT to the origin
|
||||
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"
|
||||
grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
|
||||
echo "FAIL: origin page not downloaded through proxy" >&2
|
||||
cat "$ok/out.log" >&2
|
||||
exit 1
|
||||
}
|
||||
grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
|
||||
echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
|
||||
cat "$ok/proxy.log" >&2
|
||||
exit 1
|
||||
}
|
||||
echo "OK: https tunneled through proxy via CONNECT"
|
||||
|
||||
# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
|
||||
: >"$ok/proxy.log"
|
||||
: >"$ok/origin-headers.log"
|
||||
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
|
||||
grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
|
||||
echo "FAIL: origin page not downloaded through authenticated proxy" >&2
|
||||
exit 1
|
||||
}
|
||||
got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
|
||||
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
|
||||
# which differs between GNU and BSD)
|
||||
test "$got" == "dXNlcjpzZWNyZXQ=" || {
|
||||
echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
|
||||
cat "$ok/proxy.log" >&2
|
||||
exit 1
|
||||
}
|
||||
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
|
||||
echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
|
||||
cat "$ok/origin-headers.log" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
|
||||
|
||||
# --- hostile proxy ----------------------------------------------------------
|
||||
# A proxy that answers 200 then streams headers forever must not hang the crawl:
|
||||
# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
|
||||
# missing bound surfaces as $HANG_RC here.
|
||||
flood="$tmpdir/flood"
|
||||
start_server "$flood" flood
|
||||
rc=0
|
||||
run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
|
||||
test "$rc" -ne "$HANG_RC" || {
|
||||
echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
|
||||
exit 1
|
||||
}
|
||||
grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null && {
|
||||
echo "FAIL: flooding proxy unexpectedly served the page" >&2
|
||||
exit 1
|
||||
}
|
||||
echo "OK: bounded proxy response, no hang on a flooding proxy"
|
||||
@@ -2,6 +2,7 @@
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
proxy-https-server.py \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -44,6 +45,7 @@ TESTS = \
|
||||
11_crawl-international.test \
|
||||
11_crawl-longurl.test \
|
||||
11_crawl-parsing.test \
|
||||
12_crawl_https.test
|
||||
12_crawl_https.test \
|
||||
13_crawl_proxy_https.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
151
tests/proxy-https-server.py
Normal file
151
tests/proxy-https-server.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
|
||||
|
||||
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
|
||||
ports. Every request line the proxy receives (and any Proxy-Authorization) is
|
||||
appended to the proxy log; every header the origin receives over the tunnel is
|
||||
appended to the origin log. That lets the test assert both that an https crawl
|
||||
tunneled through the proxy and that proxy credentials never leaked to the origin.
|
||||
|
||||
Proxy modes (argv[3], default "ok"):
|
||||
ok - honour CONNECT and tunnel to the origin
|
||||
flood - answer 200 then stream headers forever with no blank line, to exercise
|
||||
the client's bound on the proxy response (must not hang the crawl)
|
||||
|
||||
Usage: proxy-https-server.py <cert.pem> <logdir> [mode]
|
||||
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
|
||||
"""
|
||||
import http.server
|
||||
import os
|
||||
import socket
|
||||
import socketserver
|
||||
import ssl
|
||||
import sys
|
||||
import threading
|
||||
|
||||
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
|
||||
PROXY_LOG = "proxy.log"
|
||||
ORIGIN_LOG = "origin-headers.log"
|
||||
|
||||
|
||||
def make_origin(logdir):
|
||||
class Origin(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
with open(os.path.join(logdir, ORIGIN_LOG), "a") as handle:
|
||||
for key in self.headers.keys():
|
||||
handle.write(key + "\n")
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/html")
|
||||
self.send_header("Content-Length", str(len(ORIGIN_BODY)))
|
||||
self.end_headers()
|
||||
self.wfile.write(ORIGIN_BODY)
|
||||
|
||||
def log_message(self, *args):
|
||||
pass
|
||||
|
||||
return Origin
|
||||
|
||||
|
||||
def start_origin(certfile, logdir):
|
||||
httpd = socketserver.TCPServer(("127.0.0.1", 0), make_origin(logdir))
|
||||
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
|
||||
ctx.load_cert_chain(certfile)
|
||||
httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)
|
||||
port = httpd.socket.getsockname()[1]
|
||||
threading.Thread(target=httpd.serve_forever, daemon=True).start()
|
||||
return port
|
||||
|
||||
|
||||
def pipe(src, dst):
|
||||
try:
|
||||
while True:
|
||||
data = src.recv(65536)
|
||||
if not data:
|
||||
break
|
||||
dst.sendall(data)
|
||||
except OSError:
|
||||
pass
|
||||
finally:
|
||||
for sock in (src, dst):
|
||||
try:
|
||||
sock.shutdown(socket.SHUT_RDWR)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def handle_client(conn, logdir, mode):
|
||||
rfile = conn.makefile("rb")
|
||||
request_line = rfile.readline().decode("latin-1").strip()
|
||||
auth = None
|
||||
while True:
|
||||
line = rfile.readline().decode("latin-1")
|
||||
if line in ("\r\n", "\n", ""):
|
||||
break
|
||||
key, _, value = line.partition(":")
|
||||
if key.strip().lower() == "proxy-authorization":
|
||||
auth = value.strip()
|
||||
with open(os.path.join(logdir, PROXY_LOG), "a") as handle:
|
||||
handle.write(request_line + "\n")
|
||||
if auth is not None:
|
||||
handle.write("AUTH " + auth + "\n")
|
||||
parts = request_line.split()
|
||||
if not (len(parts) >= 2 and parts[0] == "CONNECT"):
|
||||
conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
|
||||
conn.close()
|
||||
return
|
||||
if mode == "flood":
|
||||
# 200, then an endless header stream with no terminating blank line: the
|
||||
# client must bound this and give up, not hang.
|
||||
try:
|
||||
conn.sendall(b"HTTP/1.0 200 Connection established\r\n")
|
||||
while True:
|
||||
conn.sendall(b"X-Pad: 0123456789\r\n")
|
||||
except OSError:
|
||||
pass
|
||||
conn.close()
|
||||
return
|
||||
host, _, port = parts[1].partition(":")
|
||||
try:
|
||||
upstream = socket.create_connection((host, int(port or 443)))
|
||||
except OSError:
|
||||
conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
|
||||
conn.close()
|
||||
return
|
||||
conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
|
||||
threading.Thread(target=pipe, args=(conn, upstream), daemon=True).start()
|
||||
pipe(upstream, conn)
|
||||
|
||||
|
||||
def start_proxy(logdir, mode):
|
||||
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
srv.bind(("127.0.0.1", 0))
|
||||
srv.listen(16)
|
||||
port = srv.getsockname()[1]
|
||||
|
||||
def serve():
|
||||
while True:
|
||||
conn, _ = srv.accept()
|
||||
threading.Thread(
|
||||
target=handle_client, args=(conn, logdir, mode), daemon=True
|
||||
).start()
|
||||
|
||||
threading.Thread(target=serve, daemon=True).start()
|
||||
return port
|
||||
|
||||
|
||||
def main():
|
||||
certfile, logdir = sys.argv[1], sys.argv[2]
|
||||
mode = sys.argv[3] if len(sys.argv) > 3 else "ok"
|
||||
for name in (PROXY_LOG, ORIGIN_LOG):
|
||||
open(os.path.join(logdir, name), "w").close()
|
||||
origin_port = start_origin(certfile, logdir)
|
||||
proxy_port = start_proxy(logdir, mode)
|
||||
print("ORIGIN %d" % origin_port, flush=True)
|
||||
print("PROXY %d" % proxy_port, flush=True)
|
||||
print("ready", flush=True)
|
||||
threading.Event().wait()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user