htsparse: don't read *(html-1) before the parse buffer (#396)

The link detector's word-boundary guards dereference *(html-1) to check the byte preceding a matched token. When the token sits at the very start of the parse buffer (html == r->adr), that reads one byte before the allocation: a heap-buffer-overflow under ASan, silent on a normal build. A stylesheet beginning with a url() token is enough to hit it. Route the three reachable guards (url(), location=, the makeindex /title check) through html_prevc(), which returns a space sentinel at the buffer start. Space is the right value for these tests: a token at offset 0 is at a word boundary, so it stays a valid match. The other *(html-1) sites only run after html has advanced past an opening tag or quote. Covers it with an offset-0 url() fixture in 01_engine-parse.test; without the fix it aborts at htsparse.c:1386 under the CI sanitizer job. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>
Merge pull request #397 from xroche/fix/css-import-94
2026-06-20 00:58:47 +03:00 · 2026-06-19 19:44:25 +02:00 · 2026-06-19 19:30:21 +02:00 · 2026-06-19 19:25:39 +02:00 · 2026-06-19 18:46:31 +02:00 · 2026-06-19 18:28:23 +02:00
10 changed files with 668 additions and 24 deletions
--- a/src/htsback.c
+++ b/src/htsback.c
@@ -2532,8 +2532,26 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
 #if HTS_USEOPENSSL
          /* SSL mode */
          if (back[i].r.ssl) {
+            int tunnel_ok = 1;
+
+            // https via proxy: CONNECT-tunnel before TLS (#85)
+            if (back[i].r.req.proxy.active && back[i].r.ssl_con == NULL) {
+              const int timeout = back[i].timeout > 0 ? back[i].timeout : 30;
+
+              tunnel_ok =
+                  http_proxy_tunnel(opt, &back[i].r, back[i].url_adr, timeout);
+              if (!tunnel_ok) {
+                if (!strnotempty(back[i].r.msg))
+                  strcpybuff(back[i].r.msg, "proxy CONNECT failed");
+                deletehttp(&back[i].r);
+                back[i].r.soc = INVALID_SOCKET;
+                back[i].r.statuscode = STATUSCODE_NON_FATAL;
+                back[i].status = STATUS_READY;
+                back_set_finished(sback, i);
+              }
+            }
            // handshake not yet launched
-            if (!back[i].r.ssl_con) {
+            if (tunnel_ok && !back[i].r.ssl_con) {
              SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
              // new session
              back[i].r.ssl_con = SSL_new(openssl_ctx);
@@ -2551,7 +2569,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
                back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
            }
            /* Error */
-            if (back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
+            if (tunnel_ok && back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
              strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
              deletehttp(&back[i].r);
              back[i].r.soc = INVALID_SOCKET;
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -953,9 +953,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
      p = buff;
      do {
        int insert_after_argc;
+        int quoted; /* "" unquotes to empty but is still a real token (#106) */

        // read next
        lastp = p;
+        quoted = (p != NULL && *p == '"');
        if (p) {
          p = next_token(p, 1);
          if (p) {
@@ -966,7 +968,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {

        /* Insert parameters BUT so that they can be in the same order */
        if (lastp) {
-          if (strnotempty(lastp)) {
+          if (strnotempty(lastp) || quoted) {
            insert_after_argc = argc - insert_after;
            cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
                     x_argvblk_size, x_ptr);
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -644,6 +644,165 @@ T_SOC http_fopen(httrackp * opt, const char *adr, const char *fil, htsblk * reto
  return http_xfopen(opt, 0, 1, 1, NULL, adr, fil, retour);
 }

+// Read a line (LF-terminated, CRs dropped) from a non-blocking socket into s,
+// waiting up to timeout before each recv. Returns its length (0 = empty), or
+// -1 on timeout/EOF/error or when the line does not fit in max bytes.
+static int proxy_getline(T_SOC soc, char *s, int max, int timeout) {
+  int len = 0;
+
+  while (check_readinput_t(soc, timeout)) {
+    unsigned char byte;
+    const int got = (int) recv(soc, &byte, 1, 0);
+
+    if (got == 0)
+      return -1; // connection closed
+    if (got != 1) {
+#ifdef _WIN32
+      const int transient = (WSAGetLastError() == WSAEWOULDBLOCK);
+#else
+      const int transient =
+          (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK);
+#endif
+      if (!transient)
+        return -1;
+      continue; // nothing to read yet: wait on the socket again
+    }
+
+    if (byte == 10) { // LF: end of line
+      s[len] = '\0';
+      return len;
+    }
+    if (byte == 13) // CR
+      continue;
+    if (len >= max - 1)
+      return -1; // line too long: bound the read against a hostile proxy
+    s[len++] = (char) byte;
+  }
+  return -1; // timed out waiting for data
+}
+
+int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
+                      int timeout) {
+  const T_SOC soc = retour->soc; // CONNECT is exchanged raw on this socket
+  const char *const host = jump_identification_const(adr); // host[:port]
+  const char *const portsep = jump_toport_const(adr);      // ":port" or NULL
+  char BIGSTK authority[HTS_URLMAXSIZE * 2];   // CONNECT target, host:port
+  char BIGSTK req[HTS_URLMAXSIZE * 4 + 1100];  // full CONNECT request text
+  char line[1024];                             // one line of the proxy's reply
+  int code;                                    // status code from its first line
+
+  if (soc == INVALID_SOCKET)
+    return 0; // no socket: fail without touching retour->msg
+
+  // CONNECT needs an explicit host:port; default the https port
+  authority[0] = '\0';
+  if (portsep != NULL)
+    strlcatbuff(authority, host, sizeof(authority)); // already host:port
+  else
+    snprintf(authority, sizeof(authority), "%s:%d", host, 443);
+
+  // backstop: reject control bytes so a stray CR/LF in the host can't smuggle a
+  // second line into the CONNECT request (host is already sanitized upstream)
+  {
+    const char *c;
+
+    for (c = authority; *c != '\0'; c++) {
+      if ((unsigned char) *c < ' ') {
+        strcpybuff(retour->msg, "proxy CONNECT: invalid host");
+        return 0;
+      }
+    }
+  }
+
+  snprintf(req, sizeof(req), "CONNECT %s HTTP/1.0" H_CRLF "Host: %s" H_CRLF,
+           authority, authority);
+
+  // creds go on the CONNECT, not the tunneled origin request
+  if (link_has_authorization(retour->req.proxy.name)) {
+    const char *a = jump_identification_const(retour->req.proxy.name);
+    const char *astart = jump_protocol_const(retour->req.proxy.name);
+    char autorisation[1100];
+    char user_pass[256];
+
+    autorisation[0] = user_pass[0] = '\0';
+    strncatbuff(user_pass, astart, (int) (a - astart) - 1); // user:pass part
+    strcpybuff(user_pass, unescape_http(OPT_GET_BUFF(opt),
+                                        OPT_GET_BUFF_SIZE(opt), user_pass));
+    code64((unsigned char *) user_pass, (int) strlen(user_pass),
+           (unsigned char *) autorisation, 0); // base64 for the Basic scheme
+    strlcatbuff(req, "Proxy-Authorization: Basic ", sizeof(req));
+    strlcatbuff(req, autorisation, sizeof(req));
+    strlcatbuff(req, H_CRLF, sizeof(req));
+  }
+  strlcatbuff(req, H_CRLF, sizeof(req)); // end of request headers
+
+  // raw send: ssl is set, so sendc() would route to TLS
+  {
+    const char *p = req;
+    size_t remain = strlen(req);
+    int stalls = 0; // consecutive failed sends; reset whenever bytes go out
+
+    while (remain > 0) {
+      const int n = (int) send(soc, p, (int) remain, 0);
+
+      if (n > 0) {
+        p += n;
+        remain -= (size_t) n;
+        stalls = 0;
+      } else {
+#ifdef _WIN32
+        const int wouldblock = (WSAGetLastError() == WSAEWOULDBLOCK);
+#else
+        const int wouldblock =
+            (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
+#endif
+        // don't spin forever on a fatal error or an unwritable socket
+        if (!wouldblock || !check_writeinput_t(soc, timeout) ||
+            ++stalls > 100) {
+          strcpybuff(retour->msg, "proxy CONNECT: write error");
+          return 0;
+        }
+      }
+    }
+  }
+
+  // proxy status line: "HTTP/1.x <code> ..."
+  if (proxy_getline(soc, line, sizeof(line), timeout) < 0) {
+    strcpybuff(retour->msg, "proxy CONNECT: no response");
+    return 0;
+  }
+  if (sscanf(line, "HTTP/%*d.%*d %d", &code) < 1)
+    code = 0; // unparsable status line: falls into the refusal branch below
+  if (code < 200 || code >= 300) {
+    snprintf(retour->msg, sizeof(retour->msg), "proxy CONNECT refused: %s",
+             strnotempty(line) ? line : "(no status)");
+    return 0;
+  }
+
+  // drain headers to the blank line; cap the count so a flooding proxy can't
+  // stall the crawl
+  {
+    int headers = 0;
+
+    for (;;) {
+      const int n = proxy_getline(soc, line, sizeof(line), timeout);
+
+      if (n < 0) {
+        strcpybuff(retour->msg, "proxy CONNECT: truncated response");
+        return 0;
+      }
+      if (n == 0)
+        break; // blank line: tunnel ready
+      if (++headers > 64) {
+        strcpybuff(retour->msg, "proxy CONNECT: too many response headers");
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
 // ouverture d'une liaison http, envoi d'une requète
 // mode: 0 GET  1 HEAD  [2 POST]
 // treat: traiter header?
@@ -680,14 +839,14 @@ T_SOC http_xfopen(httrackp * opt, int mode, int treat, int waitconnect,

  /* connexion */
  if (retour) {
-    if ((!(retour->req.proxy.active))
-        || ((strcmp(adr, "file://") == 0)
-            || (strncmp(adr, "https://", 8) == 0)
-        )
-      ) {                       /* pas de proxy, ou non utilisable ici */
+    /* no proxy, or proxy not usable here (local file) */
+    if ((!(retour->req.proxy.active)) || (strcmp(adr, "file://") == 0)) {
      soc = newhttp(opt, adr, retour, -1, waitconnect);
    } else {
-      soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port, waitconnect);  // ouvrir sur le proxy à la place
+      // to the proxy; https tunnels to the origin via CONNECT in back_wait
+      // (#85)
+      soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port,
+                    waitconnect);
    }
  } else {
    soc = newhttp(opt, adr, NULL, -1, waitconnect);
@@ -1043,8 +1202,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
    if (xsend)
      print_buffer(&bstr, "%s", xsend);  // éventuelles autres lignes

-    // tester proxy authentication
-    if (retour->req.proxy.active) {
+    // for https, auth rides the CONNECT (the tunneled GET would leak it)
+    if (retour->req.proxy.active && strncmp(adr, "https://", 8) != 0) {
      if (link_has_authorization(retour->req.proxy.name)) {     // et hop, authentification proxy!
        const char *a = jump_identification_const(retour->req.proxy.name);
        const char *astart = jump_protocol_const(retour->req.proxy.name);
@@ -1827,6 +1986,24 @@ int check_readinput_t(T_SOC soc, int timeout) {
    return 0;
 }

+// wait up to timeout seconds for soc to become writable: 1 if it is, else 0
+int check_writeinput_t(T_SOC soc, int timeout) {
+  fd_set wset;
+  struct timeval delay;
+  const int fd = (int) soc;
+
+  if (soc == INVALID_SOCKET)
+    return 0;
+  assertf(fd == soc);
+  FD_ZERO(&wset);
+  FD_SET(fd, &wset);
+  delay.tv_sec = timeout;
+  delay.tv_usec = 0;
+  // select()'s own result is ignored; readiness is read back from the set
+  select(fd + 1, NULL, &wset, NULL, &delay);
+  return FD_ISSET(fd, &wset) ? 1 : 0;
+}
+
 // idem, sauf qu'ici on peut choisir la taille max de données à recevoir
 // SI bufl==0 alors le buffer est censé être de 8kos, et on recoit par bloc de lignes
 // en éliminant les cr (ex: header), arrêt si double-lf
--- a/src/htslib.h
+++ b/src/htslib.h
@@ -198,6 +198,17 @@ HTS_INLINE void deletesoc_r(htsblk * r);
 htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
 int check_readinput(htsblk * r);
 int check_readinput_t(T_SOC soc, int timeout);
+int check_writeinput_t(T_SOC soc, int timeout);
+
+/* Open an HTTP CONNECT tunnel through the active proxy for an https request:
+   `retour->soc` must already be TCP-connected to the proxy, and `adr` is the
+   origin authority (url_adr, e.g. "https://host:port"). Sends CONNECT (with
+   Proxy-Authorization if the proxy carries credentials), reads the status
+   line and drains the headers, so the caller's TLS handshake then runs with
+   the origin. Each socket wait is capped at `timeout` seconds. Returns 1 on a
+   2xx tunnel, else 0 (cause in retour->msg if known; statuscode untouched). */
+int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
+                      int timeout);
 void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
               char *rcvd);
 void treatfirstline(htsblk * retour, const char *rcvd);
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -296,6 +296,12 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
  return dst;
 }

+/* Byte before html (unsigned: ctype-safe), or ' ' at start: no html[-1] read */
+static HTS_INLINE unsigned char html_prevc(const char *html,
+                                           const char *start) {
+  return html > start ? (unsigned char) html[-1] : ' ';
+}
+
 /* Main parser */
 int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
  char catbuff[CATBUFF_SIZE];
@@ -556,7 +562,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                  if (opt->getmode & HTS_GETMODE_HTML) {
                    p = strfield(html, "title");
                    if (p) {
-                      if (*(html - 1) == '/')
+                      if (html_prevc(html, r->adr) == '/')
                        p = 0;  // /title
                    } else {
                      if (strfield(html, "/html"))
@@ -1341,6 +1347,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                    int can_avoid_quotes = 0;
                    char quotes_replacement = '\0';
                    int ensure_not_mime = 0;
+                    // @import: the quoted token is the URL; a trailing
+                    // media/supports/layer condition is not part of it
+                    int is_import = 0;

                    if (inscript_tag)
                      expected_end = ";\"\'";   // voir a href="javascript:doc.location='foo'"
@@ -1357,9 +1366,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                      if (!nc)
                        nc = strfield(html, ":location");        // javascript:location="doc"
                      if (!nc) {        // location="doc"
-                        if ((nc = strfield(html, "location"))
-                            && !isspace(*(html - 1))
-                          )
+                        if ((nc = strfield(html, "location")) &&
+                            !isspace(html_prevc(html, r->adr)))
                          nc = 0;
                      }
                      if (!nc)
@@ -1380,7 +1388,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          expected = '(';       // parenthèse
                          expected_end = ")";   // fin: parenthèse
                        }
-                      if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') {  // url(url)
+                      if (!nc && (nc = strfield(html, "url")) &&
+                          (!isalnum(html_prevc(html, r->adr))) &&
+                          html_prevc(html, r->adr) != '_') { // url(url)
                        expected = '('; // parenthèse
                        expected_end = ")";     // fin: parenthèse
                        can_avoid_quotes = 1;
@@ -1390,6 +1400,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                        if ((nc = strfield(html, "import"))) {   // import "url"
                          if (is_space(*(html + nc))) {
                            expected = 0;       // no char expected
+                            is_import = 1;
                          } else
                            nc = 0;
                        }
@@ -1407,6 +1418,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
                            const char *b, *c;
                            int ndelim = 1;
+                            int valid_url = 0;

                            if ((*a == 34) || (*a == '\''))
                              a++;
@@ -1421,12 +1433,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                b++;
                            }
                            c = b--;
-                            c += ndelim;
-                            while(*c == ' ')
-                              c++;
-                            if ((strchr(expected_end, *c)) || (*c == '\n')
-                                || (*c == '\r')) {
-                              c -= (ndelim + 1);
+                            // no closing delimiter here (truncated input):
+                            // Don't scan past the buffer NUL or capture it.
+                            if (*c != '\0') {
+                              c += ndelim;
+                              while (*c == ' ')
+                                c++;
+                              valid_url =
+                                  (strchr(expected_end, *c)) || (*c == '\n') ||
+                                  (*c == '\r') ||
+                                  (is_import && *(b + 1 + ndelim) == ' ');
+                            }
+                            if (valid_url) {
+                              // URL end = last char (b), not the delimiter
+                              c = b;
                              if ((int) (c - a + 1)) {
                                if (ensure_not_mime) {
                                  int i = 0;
@@ -1485,7 +1505,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                }
                              }
                            }
-
                          }
                        }
                      }
@@ -1692,6 +1711,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                                              hts_nodetect[i -
                                                                           1]);
                                              }
+                                              // xmlns / xmlns:prefix declare
+                                              // XML namespaces, not resources
+                                              // (#191)
+                                              else {
+                                                const int xl = strfield(
+                                                    intag_startattr, "xmlns");
+                                                const char xc =
+                                                    intag_startattr[xl];
+                                                if (xl &&
+                                                    (xc == ':' || xc == '=' ||
+                                                     is_space(xc))) {
+                                                  url_ok = 0;
+                                                  hts_log_print(
+                                                      opt, LOG_DEBUG,
+                                                      "dirty parsing: xmlns "
+                                                      "namespace avoided");
+                                                }
+                                              }
                                            }
                                  }

--- a/tests/01_engine-doitlog.test
+++ b/tests/01_engine-doitlog.test
@@ -89,4 +89,37 @@ grep -q NEWCONTENT "$(find "$out" -path '*/a.html' -print -quit)" || {
    exit 1
 }

+# --- 3. an empty quoted arg survives the doit.log round-trip (#106) ----------
+# -%F "" (empty footer) records an empty "" token in doit.log; -r2 follows it so
+# a "drop the empty token" bug shifts -r2 into -%F's slot (the reprise then sees
+# -%F -r2 and panics "%F needs to be followed by ..."), making the bug visible
+# rather than a harmless run off the end of argv.
+out2="$tmp/out2"
+rc=0
+"$bin" "$url" -O "$out2" --quiet -n -%v0 -%F "" -r2 >/dev/null 2>&1 || rc=$?
+test "$rc" -eq 0 || {
+    echo "FAIL: initial mirror with empty footer exited $rc"
+    exit 1
+}
+# precondition: the writer put the empty token on disk for the reader to reload.
+grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
+    echo "FAIL: empty footer not recorded as -%F \"\" -r2 in doit.log"
+    grep -- '-%F' "$out2/hts-cache/doit.log" || true
+    exit 1
+}
+# no-url reprise: the reader rebuilds argv from doit.log and rewrites doit.log
+# from it. The empty token surviving in the regenerated file proves the reader
+# kept it (a drop/swallow would panic above or rewrite -%F without the "").
+rc=0
+"$bin" -O "$out2" --quiet >/dev/null 2>&1 || rc=$?
+test "$rc" -eq 0 || {
+    echo "FAIL: empty-footer reprise exited $rc (empty token dropped from doit.log?)"
+    exit 1
+}
+grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
+    echo "FAIL: empty footer did not survive the doit.log reload round-trip"
+    grep -- '-%F' "$out2/hts-cache/doit.log" || true
+    exit 1
+}
+
 exit 0
--- a/tests/01_engine-parse.test
+++ b/tests/01_engine-parse.test
@@ -154,4 +154,81 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
 grep -q 'title="file://' "$saved2" ||
    ! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1

+# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
+# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
+site3="$tmp/xmlns"
+mkdir -p "$site3"
+for f in ns og rdfs real; do gif "$site3/$f.gif"; done
+cat >"$site3/index.html" <<EOF
+<html xmlns="file://$site3/ns.gif"><body>
+<svg xmlns:og="file://$site3/og.gif"></svg>
+<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
+<a href="file://$site3/real.gif">real link</a>
+</body></html>
+EOF
+out3="$tmp/xmlns-out"
+crawl "$site3/index.html" "$out3"
+
+# the real link is still captured
+found "real.gif" "$out3"
+# namespace-declaration targets must not be fetched (default + prefixed forms)
+notfound "ns.gif" "$out3"
+notfound "og.gif" "$out3"
+notfound "rdfs.gif" "$out3"
+
+# CSS @import (#94): every form's target is captured, crawling the .css directly.
+# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
+# a space before ';'); they are the negative controls: without the parser fix the
+# URL is dropped, so a regression fails these found() checks.
+site4="$tmp/cssimport"
+mkdir -p "$site4"
+for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
+cat >"$site4/main.css" <<'EOF'
+@import url(nq.css);
+@import url("dqu.css");
+@import url('squ.css');
+@import "dqs.css";
+@import 'sqs.css';
+@import url(med.css) screen and (min-width: 400px);
+@import "cond.css" screen;
+@import "sup.css" supports(display: flex);
+@import url(lay.css) layer(base);
+@import "spc.css" ;
+EOF
+out4="$tmp/cssimport-out"
+crawl "$site4/main.css" "$out4"
+for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
+
+# Over-capture guard: the trailing condition is not part of the URL, so it must
+# survive the rewrite verbatim. A regression that grabs it would mangle these.
+m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
+test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
+for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
+    grep -Fq "$cond" "$m4" ||
+        ! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
+done
+
+# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
+# capture a bogus link; a valid sibling import is still captured. Guards a heap
+# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
+site5="$tmp/cssimport-trunc"
+mkdir -p "$site5"
+printf 'body{}\n' >"$site5/good.css"
+printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
+out5="$tmp/cssimport-trunc-out"
+crawl "$site5/main.css" "$out5"
+found "good.css" "$out5"
+notfound "trunc" "$out5"
+
+# Offset-0 underflow (#396): a token at the buffer start makes the detector's
+# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
+# url() target is still captured; here it just must not underflow.
+site6="$tmp/parse-off0"
+mkdir -p "$site6"
+printf 'body{}\n' >"$site6/off0.css"
+printf 'url(off0.css)\n' >"$site6/main.css"
+out6="$tmp/parse-off0-out"
+crawl "$site6/main.css" "$out6"
+found "off0.css" "$out6"
+
 exit 0
--- a/tests/13_crawl_proxy_https.test
+++ b/tests/13_crawl_proxy_https.test
@@ -0,0 +1,136 @@
+#!/bin/bash
+#
+# Issue #85: an https crawl must go through the configured proxy (CONNECT
+# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
+# TLS origin plus a logging CONNECT proxy, so no network access is needed.
+
+set -euo pipefail
+
+: "${top_srcdir:=..}"
+
+if test "${HTTPS_SUPPORT:-}" == "no"; then
+    echo "no https support compiled, skipping"
+    exit 77
+fi
+if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
+    echo "python3/openssl missing, skipping"
+    exit 77
+fi
+
+server="$top_srcdir/tests/proxy-https-server.py"
+tmpdir=$(mktemp -d)
+pids=
+
+cleanup() { # EXIT trap: stop every background server, then drop the temp dir
+    for pid in $pids; do
+        kill "$pid" 2>/dev/null || true # an already-dead pid is not an error
+    done
+    rm -rf "$tmpdir"
+}
+trap cleanup EXIT
+
+# self-signed cert for the local TLS origin (httrack does not verify certs)
+openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
+    -out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
+    >/dev/null 2>&1
+cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
+
+# start_server <logdir> <mode>: launches a proxy+origin pair in the background,
+# waits for "ready", then sets $origin_port/$proxy_port from its announced ports.
+start_server() {
+    local dir="$1" mode="$2" ports
+    mkdir -p "$dir"
+    ports="$dir/ports.txt" # the server's stdout: ORIGIN/PROXY/ready lines
+    python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
+        >"$ports" 2>"$dir/server.err" &
+    pids="$pids $!" # remembered so cleanup() can kill it
+    for _ in $(seq 1 100); do # poll for "ready": 100 tries, 0.1s apart
+        grep -q "^ready" "$ports" 2>/dev/null && break
+        sleep 0.1
+    done
+    grep -q "^ready" "$ports" 2>/dev/null || {
+        echo "server ($mode) did not start" >&2
+        cat "$dir/server.err" >&2
+        exit 1 # aborts the whole test, not just this function
+    }
+    origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
+    proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
+}
+
+# run_crawl <outdir> <proxy> <origin-port>: run httrack, but kill it after a
+# deadline so a hang (e.g. a missing bound on the proxy response) surfaces as
+# the kill code $HANG_RC instead of stalling the whole job. Returns httrack's
+# exit status; the log goes to <outdir>.log. Portable stand-in for `timeout`.
+HANG_RC=137 # 128 + SIGKILL
+run_crawl() {
+    local out="$1" proxy="$2" port="$3"
+    rm -rf "$out"
+    httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
+        -O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
+    local pid=$!
+    (sleep 60 && kill -9 "$pid" 2>/dev/null) & # watchdog: SIGKILL after 60s
+    local guard=$!
+    local rc=0
+    wait "$pid" 2>/dev/null || rc=$? # httrack's status ($HANG_RC if killed)
+    kill "$guard" 2>/dev/null || true # crawl is over: disarm the watchdog
+    wait "$guard" 2>/dev/null || true
+    return "$rc"
+}
+
+# --- working proxy ----------------------------------------------------------
+ok="$tmpdir/ok"
+start_server "$ok" ok
+
+# 1. page retrieved AND the proxy saw a CONNECT to the origin
+run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"
+grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
+    echo "FAIL: origin page not downloaded through proxy" >&2
+    cat "$ok/out.log" >&2
+    exit 1
+}
+grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
+    echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
+    cat "$ok/proxy.log" >&2
+    exit 1
+}
+echo "OK: https tunneled through proxy via CONNECT"
+
+# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
+: >"$ok/proxy.log"
+: >"$ok/origin-headers.log"
+run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
+grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
+    echo "FAIL: origin page not downloaded through authenticated proxy" >&2
+    exit 1
+}
+got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
+# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
+# which differs between GNU and BSD)
+test "$got" == "dXNlcjpzZWNyZXQ=" || {
+    echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
+    cat "$ok/proxy.log" >&2
+    exit 1
+}
+if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
+    echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
+    cat "$ok/origin-headers.log" >&2
+    exit 1
+fi
+echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
+
+# --- hostile proxy ----------------------------------------------------------
+# A proxy that answers 200 then streams headers forever must not hang the crawl:
+# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
+# missing bound surfaces as $HANG_RC here.
+flood="$tmpdir/flood"
+start_server "$flood" flood
+rc=0
+run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
+test "$rc" -ne "$HANG_RC" || {
+    echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
+    exit 1
+}
+grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null && {
+    echo "FAIL: flooding proxy unexpectedly served the page" >&2
+    exit 1
+}
+echo "OK: bounded proxy response, no hang on a flooding proxy"
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -2,6 +2,7 @@
 # explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
 # silently drop it from the dist tarball and break "make distcheck".
 EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
+	proxy-https-server.py \
 	fixtures/cache-golden/hts-cache/new.zip

 TESTS_ENVIRONMENT =
@@ -44,6 +45,7 @@ TESTS = \
 	11_crawl-international.test \
 	11_crawl-longurl.test \
 	11_crawl-parsing.test \
-	12_crawl_https.test
+	12_crawl_https.test \
+	13_crawl_proxy_https.test

 CLEANFILES = check-network_sh.cache
--- a/tests/proxy-https-server.py
+++ b/tests/proxy-https-server.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
+
+Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
+ports. Every request line the proxy receives (and any Proxy-Authorization) is
+appended to the proxy log; every header the origin receives over the tunnel is
+appended to the origin log. That lets the test assert both that an https crawl
+tunneled through the proxy and that proxy credentials never leaked to the origin.
+
+Proxy modes (argv[3], default "ok"):
+  ok    - honour CONNECT and tunnel to the origin
+  flood - answer 200 then stream headers forever with no blank line, to exercise
+          the client's bound on the proxy response (must not hang the crawl)
+
+Usage: proxy-https-server.py <cert.pem> <logdir> [mode]
+Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
+"""
+import http.server
+import os
+import socket
+import socketserver
+import ssl
+import sys
+import threading
+
+ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
+PROXY_LOG = "proxy.log"
+ORIGIN_LOG = "origin-headers.log"
+
+
+def make_origin(logdir):  # build the origin's handler class, logging into logdir
+    class Origin(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):  # log received header names, then serve ORIGIN_BODY
+            with open(os.path.join(logdir, ORIGIN_LOG), "a") as handle:
+                for key in self.headers.keys():
+                    handle.write(key + "\n")  # names only, one per line
+            self.send_response(200)
+            self.send_header("Content-Type", "text/html")
+            self.send_header("Content-Length", str(len(ORIGIN_BODY)))
+            self.end_headers()
+            self.wfile.write(ORIGIN_BODY)
+
+        def log_message(self, *args):
+            pass  # silence the base handler's per-request logging
+
+    return Origin
+
+
+def start_origin(certfile, logdir):  # start the TLS origin; returns its port
+    httpd = socketserver.TCPServer(("127.0.0.1", 0), make_origin(logdir))  # port 0: ephemeral
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+    ctx.load_cert_chain(certfile)  # no keyfile given: the key is read from certfile too
+    httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)  # wrap the listening socket
+    port = httpd.socket.getsockname()[1]
+    threading.Thread(target=httpd.serve_forever, daemon=True).start()  # serve in the background
+    return port
+
+
+def pipe(src, dst):  # relay bytes src -> dst until EOF or a socket error
+    try:
+        while True:
+            data = src.recv(65536)
+            if not data:  # empty read: the peer closed its side
+                break
+            dst.sendall(data)
+    except OSError:
+        pass  # a broken connection simply ends the relay
+    finally:
+        for sock in (src, dst):  # shut down both ends (presumably to unblock the opposite relay)
+            try:
+                sock.shutdown(socket.SHUT_RDWR)
+            except OSError:
+                pass  # best effort: shutdown may fail if already torn down
+
+
+def handle_client(conn, logdir, mode):
+    rfile = conn.makefile("rb")
+    request_line = rfile.readline().decode("latin-1").strip()
+    auth = None
+    while True:
+        line = rfile.readline().decode("latin-1")
+        if line in ("\r\n", "\n", ""):
+            break
+        key, _, value = line.partition(":")
+        if key.strip().lower() == "proxy-authorization":
+            auth = value.strip()
+    with open(os.path.join(logdir, PROXY_LOG), "a") as handle:
+        handle.write(request_line + "\n")
+        if auth is not None:
+            handle.write("AUTH " + auth + "\n")
+    parts = request_line.split()
+    if not (len(parts) >= 2 and parts[0] == "CONNECT"):
+        conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
+        conn.close()
+        return
+    if mode == "flood":
+        # 200, then an endless header stream with no terminating blank line: the
+        # client must bound this and give up, not hang.
+        try:
+            conn.sendall(b"HTTP/1.0 200 Connection established\r\n")
+            while True:
+                conn.sendall(b"X-Pad: 0123456789\r\n")
+        except OSError:
+            pass
+        conn.close()
+        return
+    host, _, port = parts[1].partition(":")
+    try:
+        upstream = socket.create_connection((host, int(port or 443)))
+    except OSError:
+        conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
+        conn.close()
+        return
+    conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
+    threading.Thread(target=pipe, args=(conn, upstream), daemon=True).start()
+    pipe(upstream, conn)
+
+
+def start_proxy(logdir, mode):  # start the CONNECT proxy; returns its port
+    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    srv.bind(("127.0.0.1", 0))  # port 0: let the OS pick a free port
+    srv.listen(16)
+    port = srv.getsockname()[1]
+
+    def serve():  # accept loop: one daemon thread per client connection
+        while True:
+            conn, _ = srv.accept()
+            threading.Thread(
+                target=handle_client, args=(conn, logdir, mode), daemon=True
+            ).start()
+
+    threading.Thread(target=serve, daemon=True).start()
+    return port
+
+
+def main():  # argv: <cert.pem> <logdir> [mode]; starts both servers, then blocks
+    certfile, logdir = sys.argv[1], sys.argv[2]
+    mode = sys.argv[3] if len(sys.argv) > 3 else "ok"
+    for name in (PROXY_LOG, ORIGIN_LOG):
+        open(os.path.join(logdir, name), "w").close()  # create/truncate both logs
+    origin_port = start_origin(certfile, logdir)
+    proxy_port = start_proxy(logdir, mode)
+    print("ORIGIN %d" % origin_port, flush=True)
+    print("PROXY %d" % proxy_port, flush=True)
+    print("ready", flush=True)  # printed last, after both port lines
+    threading.Event().wait()  # never set: keep the process alive for the daemon threads
+
+
+if __name__ == "__main__":
+    main()
Author	SHA1	Message	Date
Xavier Roche	799c045061	htsparse: don't read *(html-1) before the parse buffer (#396) The link detector's word-boundary guards dereference *(html-1) to check the byte preceding a matched token. When the token sits at the very start of the parse buffer (html == r->adr), that reads one byte before the allocation: a heap-buffer-overflow under ASan, silent on a normal build. A stylesheet beginning with a url() token is enough to hit it. Route the three reachable guards (url(), location=, the makeindex /title check) through html_prevc(), which returns a space sentinel at the buffer start. Space is the right value for these tests: a token at offset 0 is at a word boundary, so it stays a valid match. The other *(html-1) sites only run after html has advanced past an opening tag or quote. Covers it with an offset-0 url() fixture in 01_engine-parse.test; without the fix it aborts at htsparse.c:1386 under the CI sanitizer job. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 19:44:25 +02:00
Xavier Roche	fb1ee3bf2e	Merge pull request #397 from xroche/fix/css-import-94 CSS @import: capture URLs that carry a media/supports/layer condition (#94)	2026-06-19 19:30:21 +02:00
Xavier Roche	6a08ca7d39	htsparse: bound the URL-end scan against a missing closing delimiter Reviewing the @import change, ASan flagged a pre-existing heap overflow: when a quoted/parenthesized link token has no closing delimiter before the buffer ends (truncated input such as `@import "x`, `@import "`, `url("x`), the scan stops at the terminating NUL, then `c += ndelim` steps past it and `while (c == ' ')` / the terminator test read out of bounds. Such input aborts under ASan on master. Skip the URL-end scan and capture when no closing delimiter was found (`c == '\0'` right after the scan); c never advances past the NUL. Well-formed tokens are unaffected. 01_engine-parse.test gains a truncated-@import fixture (the valid sibling import is still captured, the unterminated one is not) that trips the overflow under the CI ASan job, plus a check that an @import's trailing media/supports/layer condition survives the rewrite verbatim. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 19:25:39 +02:00
Xavier Roche	a8b491e509	htsparse: capture conditional CSS @import URLs (#94 ) A bare-string @import carrying a media/supports/layer condition, e.g. `@import "theme.css" screen;`, was dropped. The detector required the closing quote to be immediately followed by the statement terminator, so the trailing condition aborted the capture. The `url(...)` form already worked because it terminates at the paren. Two coupled defects in the inscript/CSS detector: - accept a whitespace-separated trailing condition after a quoted @import URL; - bound the captured URL at its last content char (b) instead of recomputing from the terminator. The old `c -= (ndelim + 1)` mishandled spaces skipped before the terminator, leaving the closing quote inside the range so the bogus-link guard aborted. That also silently broke `foo="url" ;` (a space before the semicolon) for every quoted detection, not only @import. 01_engine-parse.test gains a CSS @import section that crawls a .css directly; the conditioned cases are negative controls that fail without the fix. Closes #94 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 18:46:31 +02:00
Xavier Roche	a8e4bb3b81	Merge pull request #395 from xroche/fix/xmlns-false-links-191 Don't crawl xmlns namespace declarations	2026-06-19 18:28:23 +02:00
Xavier Roche	0145ec37a3	htsparse: don't crawl xmlns namespace declarations (#191 ) The "dirty parsing" heuristic accepts any tag attribute whose value looks like a URL unless the attribute is on the no-detect list. xmlns and xmlns:prefix declarations carry namespace URIs (xmlns:og="http://ogp.me/ns#", etc.) that are not resources, so httrack queued and fetched them, stalling the crawl on unrelated spec URLs. Reject xmlns/xmlns:prefix where the no-detect list is already consulted. 01_engine-parse.test grows a fixture with each form (default and prefixed) as the last attribute of its element, since the heuristic only inspects an attribute whose value is immediately followed by '>'; the targets are local file:// gifs so a regression actually downloads them (verified: reverting the guard fetches all three). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 18:24:55 +02:00
Xavier Roche	a80fab38ba	Merge pull request #394 from xroche/fix/proxy-https-connect-85 Tunnel https through the proxy via CONNECT (#85)	2026-06-19 18:03:31 +02:00
Xavier Roche	c52a524a63	htslib: bound the proxy CONNECT response; harden + cover review findings Follow-up to the CONNECT-tunnel change, from an adversarial review (the proxy response is hostile input: a malicious or MITM proxy controls every byte). - Bound the response read so a proxy cannot stall the single-threaded back_wait crawl: proxy_getline now fails on an over-long line instead of consuming it forever, the header drain is capped at 64 lines, and the send loop gives up rather than spin against a socket that reports writable but never accepts. - Size `authority` to hold any url_adr host (HTS_URLMAXSIZE*2) so an oversized hostname can't trip the abort-on-overflow buff helpers; grow `req` to match. - Reject control bytes in the CONNECT authority as a local backstop; today the CR/LF defense lives entirely upstream (escape_remove_control / header-line splitting). - Test: the origin now records the headers it receives, and the test asserts Proxy-Authorization never reaches the origin through the tunnel (the previous assertions couldn't see a leak). Added a flooding-proxy scenario that proves the crawl terminates instead of hanging on an unbounded response. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 09:52:10 +02:00
Xavier Roche	1907621d37	htslib: tunnel https through the proxy via CONNECT (#85 ) httrack opened https connections straight to the origin even when a proxy was configured, so --proxy was silently ignored for https and the crawler used the real IP. http_xfopen bypassed the proxy for any https:// URL, because the absolute-URI proxy form it uses for http cannot carry https. Connect to the proxy instead and, once the TCP connection is up, open an HTTP CONNECT tunnel (http_proxy_tunnel) before the TLS handshake, so TLS runs end-to-end with the origin. Proxy credentials now ride the CONNECT request rather than the tunneled GET, where they would leak to the origin. The exchange is a bounded blocking read inside the back_wait connect path: no new async state, no struct/ABI change (the helpers stay visibility-hidden). Verified end-to-end by 13_crawl_proxy_https.test: it crawls a local self-signed https origin through a logging CONNECT proxy and asserts the proxy saw the CONNECT and that credentials ride it. The assertion fails on the pre-fix bypass. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 08:43:56 +02:00
Xavier Roche	3b2d7afdaa	Merge pull request #393 from xroche/fix/empty-footer-doitlog-106 Keep empty quoted args when reloading doit.log (#106)	2026-06-19 08:13:19 +02:00
Xavier Roche	6ee539619e	htscoremain: keep empty quoted args when reloading doit.log (#106 ) An empty footer (-%F "") is written to hts-cache/doit.log correctly as the two-character token "", and next_token() unquotes it back to an empty string. But the doit.log reload loop only re-inserted a token when strnotempty(lastp), which dropped the empty one. With its argument gone, -%F absorbed the following token (or had none), so a no-url --continue/--update reprise misparsed and failed. Track whether the token started with a quote (before next_token() strips it in place) and keep it even when empty, so "" survives the round-trip. Whitespace gaps still produce no token, so spacing behavior is unchanged. 01_engine-doitlog.test gains a scenario that mirrors with -%F "" -r2, then on the no-url reprise checks the regenerated doit.log still round-trips the empty token -- probing the reader's rebuilt argv, not just that the reprise didn't crash. The trailing -r2 makes a dropped-token bug visible (it shifts into -%F's slot and panics) rather than a harmless run off the end of argv. Reverting only the guard makes the scenario fail (reprise exits 255). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 08:09:57 +02:00
Xavier Roche	fb098b27b4	Merge pull request #392 from xroche/fix/cookie-rfc6265-151 Drop $Version/$Path from the request Cookie header (#151)	2026-06-18 22:42:47 +02:00