Test the relative-link engine; collapse ../ in file:// URLs

The ../-handling tickets #137 (embedded ../ in a URL) and #162 (cross-host "too many ../") do not reproduce on master or the released 3.49.x: the engine has resolved embedded, cross-host, out-of-scope and above-root ../ correctly since the 2012 import, and the released binary behaves identically. #137's actual breakage was a JS-generated iframe URL (httrack can't rewrite dynamically-built links); #162 is a long-gone Windows path quirk. The area was nearly untested, though, despite feeding both link rewriting and crawl-scope decisions: two trivial lienrelatif asserts, none for ident_url_relatif. Add a wide regression net via two hidden debug probes (-#l lienrelatif, -#i ident_url_relatif, mirroring -#1 fil_simplifie) driving tens of cases in tests/01_engine-relative.test (embedded/cross-host/sibling/ ancestor/above-root ../, query stripping, scheme handling), plus the missing fil_simplifie edge cases (absolute paths, root clamp, query freeze) in 01_engine-simplify.test. Expected values are computed by hand, not echoed. While covering it, fixed one real gap: the file:// branch of ident_url_absolute skipped the fil_simplifie its http sibling runs, so file:// URLs kept their ../ in adrfil->fil while the save path was already collapsed (htsname.c:1343). Collapsing it matches the other schemes, contains traversal at the file:// root, and dedups a/../b against b. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>
Merge pull request #400 from xroche/fix/css-url-paren-163
2026-06-20 17:18:14 +03:00 · 2026-06-20 11:14:28 +02:00 · 2026-06-20 10:02:32 +02:00 · 2026-06-20 10:01:17 +02:00 · 2026-06-19 20:36:26 +02:00 · 2026-06-19 20:27:04 +02:00
7 changed files with 386 additions and 12 deletions
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2787,6 +2787,47 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
                  return 0;
                }
                break;
+              case 'l': /* lienrelatif: relative link from curr_fil to link */
+                if (na + 2 >= argc) {
+                  HTS_PANIC_PRINTF(
+                      "Option #l needs a link and a current-file path");
+                  printf(
+                      "Example: '-#l' 'host/dir/img.gif' 'host/dir/p.html'\n");
+                  htsmain_free();
+                  return -1;
+                } else {
+                  char s[HTS_URLMAXSIZE * 2];
+
+                  if (lienrelatif(s, sizeof(s), argv[na + 1], argv[na + 2]) ==
+                      0)
+                    printf("relative=%s\n", s);
+                  else
+                    printf("relative=<ERROR>\n");
+                  htsmain_free();
+                  return 0;
+                }
+                break;
+              case 'i': /* ident_url_relatif: resolve a link -> adr/fil */
+                if (na + 3 >= argc) {
+                  HTS_PANIC_PRINTF(
+                      "Option #i needs a link, an origin address and file");
+                  printf("Example: '-#i' '../img.gif' 'www.foo.com' "
+                         "'/d/p.html'\n");
+                  htsmain_free();
+                  return -1;
+                } else {
+                  lien_adrfil af;
+                  const int r = ident_url_relatif(argv[na + 1], argv[na + 2],
+                                                  argv[na + 3], &af);
+
+                  if (r == 0)
+                    printf("adr=%s fil=%s\n", af.adr, af.fil);
+                  else
+                    printf("error=%d\n", r);
+                  htsmain_free();
+                  return 0;
+                }
+                break;
              case '2':        // mimedefs
                if (na + 1 >= argc) {
                  HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -2605,6 +2605,8 @@ int ident_url_absolute(const char *url, lien_adrfil *adrfil) {
    for(i = 0; adrfil->fil[i] != '\0'; i++)
      if (adrfil->fil[i] == '\\')
        adrfil->fil[i] = '/';
+    // collapse ../ like the http branch above (path-traversal safety)
+    fil_simplifie(adrfil->fil);
  }

  // no hostname
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -296,6 +296,48 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
  return dst;
 }

+/* Byte before html as an unsigned-char value (safe for <ctype.h>), or a space
+   at the buffer start, where html[-1] would underflow: a word boundary. */
+static HTS_INLINE int html_prevc(const char *html, const char *start) {
+  return html > start ? (unsigned char) html[-1] : ' ';
+}
+
+/* Report whether the len bytes at s spell exactly one HTTP method name, in any
+   letter case: XHR.open()'s first argument is a method, not a URL (#218). */
+static int is_http_method(const char *s, size_t len) {
+  static const char *const known[] = {"GET",    "POST",  "PUT",
+                                      "DELETE", "HEAD",  "OPTIONS",
+                                      "PATCH",  "TRACE", NULL};
+  const char *const *m;
+
+  for (m = known; *m != NULL; m++) {
+    if (strlen(*m) == len && strfield(s, *m) == (int) len)
+      return 1;
+  }
+  return 0;
+}
+
+/* Percent-encode each '(' and ')' of s in place (%28/%29), for a link emitted
+   into an unquoted url(...) (CSS or JS), where a raw ')' ends the token early
+   (#163). Copying stops once a 3-byte escape may not fit (output cut). */
+static void escape_url_parens(char *const s, const size_t size) {
+  char BIGSTK buff[HTS_URLMAXSIZE * 2];
+  const char *in;
+  size_t out = 0;
+
+  for (in = s; *in != '\0' && out + 3 < size && out + 3 < sizeof(buff); in++) {
+    if (*in != '(' && *in != ')') {
+      buff[out++] = *in;
+      continue;
+    }
+    buff[out++] = '%';
+    buff[out++] = '2';
+    buff[out++] = *in == '(' ? '8' : '9';
+  }
+  buff[out] = '\0';
+  strlcpybuff(s, buff, size);
+}
+
 /* Main parser */
 int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
  char catbuff[CATBUFF_SIZE];
@@ -556,7 +598,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                  if (opt->getmode & HTS_GETMODE_HTML) {
                    p = strfield(html, "title");
                    if (p) {
-                      if (*(html - 1) == '/')
+                      if (html_prevc(html, r->adr) == '/')
                        p = 0;  // /title
                    } else {
                      if (strfield(html, "/html"))
@@ -1341,6 +1383,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                    int can_avoid_quotes = 0;
                    char quotes_replacement = '\0';
                    int ensure_not_mime = 0;
+                    // .open(method,url): reject an HTTP-method first arg (#218)
+                    int ensure_not_method = 0;
+                    // @import: the quoted token is the URL; a trailing
+                    // media/supports/layer condition is not part of it
+                    int is_import = 0;

                    if (inscript_tag)
                      expected_end = ";\"\'";   // voir a href="javascript:doc.location='foo'"
@@ -1357,9 +1404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                      if (!nc)
                        nc = strfield(html, ":location");        // javascript:location="doc"
                      if (!nc) {        // location="doc"
-                        if ((nc = strfield(html, "location"))
-                            && !isspace(*(html - 1))
-                          )
+                        if ((nc = strfield(html, "location")) &&
+                            !isspace(html_prevc(html, r->adr)))
                          nc = 0;
                      }
                      if (!nc)
@@ -1369,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          expected = '(';       // parenthèse
                          expected_end = "),";  // fin: virgule ou parenthèse
                          ensure_not_mime = 1;  //* ensure the url is not a mime type */
+                          ensure_not_method = 1; // xhr.open: don't grab method
                        }
                      if (!nc)
                        if ((nc = strfield(html, ".replace"))) { // window.replace("url")
@@ -1380,7 +1427,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          expected = '(';       // parenthèse
                          expected_end = ")";   // fin: parenthèse
                        }
-                      if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') {  // url(url)
+                      if (!nc && (nc = strfield(html, "url")) &&
+                          (!isalnum(html_prevc(html, r->adr))) &&
+                          html_prevc(html, r->adr) != '_') { // url(url)
                        expected = '('; // parenthèse
                        expected_end = ")";     // fin: parenthèse
                        can_avoid_quotes = 1;
@@ -1390,6 +1439,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                        if ((nc = strfield(html, "import"))) {   // import "url"
                          if (is_space(*(html + nc))) {
                            expected = 0;       // no char expected
+                            is_import = 1;
                          } else
                            nc = 0;
                        }
@@ -1407,6 +1457,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
                            const char *b, *c;
                            int ndelim = 1;
+                            int valid_url = 0;

                            if ((*a == 34) || (*a == '\''))
                              a++;
@@ -1421,12 +1472,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                b++;
                            }
                            c = b--;
-                            c += ndelim;
-                            while(*c == ' ')
-                              c++;
-                            if ((strchr(expected_end, *c)) || (*c == '\n')
-                                || (*c == '\r')) {
-                              c -= (ndelim + 1);
+                            // no closing delimiter here (truncated input):
+                            // Don't scan past the buffer NUL or capture it.
+                            if (*c != '\0') {
+                              c += ndelim;
+                              while (*c == ' ')
+                                c++;
+                              valid_url =
+                                  (strchr(expected_end, *c)) || (*c == '\n') ||
+                                  (*c == '\r') ||
+                                  (is_import && *(b + 1 + ndelim) == ' ');
+                            }
+                            if (valid_url) {
+                              // URL end = last char (b), not the delimiter
+                              c = b;
                              if ((int) (c - a + 1)) {
                                if (ensure_not_mime) {
                                  int i = 0;
@@ -1442,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                    i++;
                                  }
                                }
+                                // XHR.open's "GET" etc. is a method, not a URL
+                                if (a != NULL && ensure_not_method &&
+                                    is_http_method(a, (size_t) (c - a + 1))) {
+                                  a = NULL;
+                                }
                                // Check for bogus links (Vasiliy)
                                if (a != NULL) {
                                  const size_t size = c - a + 1;
@@ -1485,7 +1549,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                }
                              }
                            }
-
                          }
                        }
                      }
@@ -1692,6 +1755,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                                                              hts_nodetect[i -
                                                                           1]);
                                              }
+                                              // xmlns / xmlns:prefix declare
+                                              // XML namespaces, not resources
+                                              // (#191)
+                                              else {
+                                                const int xl = strfield(
+                                                    intag_startattr, "xmlns");
+                                                const char xc =
+                                                    intag_startattr[xl];
+                                                if (xl &&
+                                                    (xc == ':' || xc == '=' ||
+                                                     is_space(xc))) {
+                                                  url_ok = 0;
+                                                  hts_log_print(
+                                                      opt, LOG_DEBUG,
+                                                      "dirty parsing: xmlns "
+                                                      "namespace avoided");
+                                                }
+                                              }
                                            }
                                  }

@@ -2967,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          /* Never escape high-chars (we don't know the encoding!!) */
                          inplace_escape_uri_utf(tempo, sizeof(tempo));

+                          // unquoted url() (CSS/JS): keep parens escaped
+                          if (ending_p == ')')
+                            escape_url_parens(tempo, sizeof(tempo));
+
                          //if (!no_esc_utf)
                          //  escape_uri(tempo);     // escape with %xx
                          //else {
--- a/tests/01_engine-parse.test
+++ b/tests/01_engine-parse.test
@@ -154,4 +154,173 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
 grep -q 'title="file://' "$saved2" ||
    ! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1

+# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
+# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
+site3="$tmp/xmlns"
+mkdir -p "$site3"
+for f in ns og rdfs real; do gif "$site3/$f.gif"; done
+cat >"$site3/index.html" <<EOF
+<html xmlns="file://$site3/ns.gif"><body>
+<svg xmlns:og="file://$site3/og.gif"></svg>
+<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
+<a href="file://$site3/real.gif">real link</a>
+</body></html>
+EOF
+out3="$tmp/xmlns-out"
+crawl "$site3/index.html" "$out3"
+
+# the real link is still captured
+found "real.gif" "$out3"
+# namespace-declaration targets must not be fetched (default + prefixed forms)
+notfound "ns.gif" "$out3"
+notfound "og.gif" "$out3"
+notfound "rdfs.gif" "$out3"
+
+# CSS @import (#94): every form's target is captured, crawling the .css directly.
+# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
+# a space before ';'); they are the negative controls: without the parser fix the
+# URL is dropped, so a regression fails these found() checks.
+site4="$tmp/cssimport"
+mkdir -p "$site4"
+for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
+cat >"$site4/main.css" <<'EOF'
+@import url(nq.css);
+@import url("dqu.css");
+@import url('squ.css');
+@import "dqs.css";
+@import 'sqs.css';
+@import url(med.css) screen and (min-width: 400px);
+@import "cond.css" screen;
+@import "sup.css" supports(display: flex);
+@import url(lay.css) layer(base);
+@import "spc.css" ;
+EOF
+out4="$tmp/cssimport-out"
+crawl "$site4/main.css" "$out4"
+for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
+
+# Over-capture guard: the trailing condition is not part of the URL, so it must
+# survive the rewrite verbatim. A regression that grabs it would mangle these.
+m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
+test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
+for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
+    grep -Fq "$cond" "$m4" ||
+        ! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
+done
+
+# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
+# capture a bogus link; a valid sibling import is still captured. Guards a heap
+# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
+site5="$tmp/cssimport-trunc"
+mkdir -p "$site5"
+printf 'body{}\n' >"$site5/good.css"
+printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
+out5="$tmp/cssimport-trunc-out"
+crawl "$site5/main.css" "$out5"
+found "good.css" "$out5"
+notfound "trunc" "$out5"
+
+# Offset-0 underflow (#396): a token at the buffer start makes the detector's
+# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
+# url() target is still captured; here it just must not underflow.
+site6="$tmp/parse-off0"
+mkdir -p "$site6"
+printf 'body{}\n' >"$site6/off0.css"
+printf 'url(off0.css)\n' >"$site6/main.css"
+out6="$tmp/parse-off0-out"
+crawl "$site6/main.css" "$out6"
+found "off0.css" "$out6"
+
+# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
+# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
+# fixture saves a bare file named GET; a live server mangles it to GET.html).
+# window.open(url) detection must be unaffected.
+site7="$tmp/xhropen"
+mkdir -p "$site7"
+gif "$site7/winopen.gif"
+cat >"$site7/index.html" <<EOF
+<html><body><script>
+var x = new XMLHttpRequest();
+x.open("GET", "ajax_info.txt");
+var y = new XMLHttpRequest();
+y.open("Post", "submit.cgi");
+window.open("file://$site7/winopen.gif");
+</script></body></html>
+EOF
+out7="$tmp/xhropen-out"
+crawl "$site7/index.html" "$out7"
+# negative control: without the fix a file named exactly GET is downloaded
+notfound "GET" "$out7"
+# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
+# method is rejected too, so a file named Post must not appear either
+notfound "Post" "$out7"
+# regression guard: window.open(url) is still detected, so its absolute URL is
+# rewritten to a local link. The rewrite only happens if the parser saw it, so
+# these two assertions fail if .open detection broke (not a trivial --near save).
+saved7=$(savedhtml "$out7")
+test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
+grep -Fq 'window.open("winopen.gif")' "$saved7" ||
+    ! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
+! grep -Fq 'window.open("file://' "$saved7" ||
+    ! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
+
+# Parens in an unquoted url(...) (#163): the source %28/%29 decode to literal
+# '(' ')' in the saved name, but a literal ')' in the rewritten url() closes the
+# token early, so they must stay encoded. Negative control: without the fix the
+# %281%29 greps fail (parens are RFC2396 "mark" chars the escaper leaves alone).
+site8="$tmp/cssparens"
+mkdir -p "$site8"
+for f in 'img (1).gif' 'a(b)c(1).gif' 'q (4).gif'; do gif "$site8/$f"; done
+cat >"$site8/style.css" <<'EOF'
+.a { background: url(img%20%281%29.gif); }
+.b { background: url(a%28b%29c%281%29.gif); }
+.c { background: url("q%20%284%29.gif"); }
+EOF
+out8="$tmp/cssparens-out"
+crawl "$site8/style.css" "$out8"
+found "img (1).gif" "$out8"
+found "a(b)c(1).gif" "$out8"
+found "q (4).gif" "$out8"
+css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
+test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
+grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
+    ! echo "FAIL #163: parens in unquoted url() not percent-encoded on rewrite" || exit 1
+grep -Fq 'url(a%28b%29c%281%29.gif)' "$css8" ||
+    ! echo "FAIL #163: not every paren in a url() was percent-encoded" || exit 1
+grep -Fq 'url("q%20%284%29.gif")' "$css8" ||
+    ! echo "FAIL #163: quoted url() altered or parens left literal on rewrite" || exit 1
+
+# The url() detector is not CSS-specific: <script> and inline style= get the
+# same encoding, but ordinary href/src (ending_p is the quote, not ')') keep
+# literal parens -- the attribute checks guard the gate against over-firing.
+site9="$tmp/urlparens"
+mkdir -p "$site9"
+for f in 'js (1).gif' 'inl (2).gif' 'asrc (3).gif' 'ahref (4).gif'; do gif "$site9/$f"; done
+cat >"$site9/index.html" <<EOF
+<html><body>
+<script>var bg = "url(js%20%281%29.gif)";</script>
+<div style="background-image:url(inl%20%282%29.gif)"></div>
+<img src="asrc%20%283%29.gif">
+<a href="ahref%20%284%29.gif">link</a>
+</body></html>
+EOF
+out9="$tmp/urlparens-out"
+crawl "$site9/index.html" "$out9"
+saved9=$(savedhtml "$out9")
+test -n "$saved9" || ! echo "FAIL: saved urlparens page not found" || exit 1
+# rewrite-only: the JS-string asset is not queued for download
+grep -Fq 'url(js%20%281%29.gif)' "$saved9" ||
+    ! echo "FAIL #163: parens in <script> url() not percent-encoded" || exit 1
+found "inl (2).gif" "$out9"
+grep -Fq 'url(inl%20%282%29.gif)' "$saved9" ||
+    ! echo "FAIL #163: parens in inline style url() not percent-encoded" || exit 1
+found "asrc (3).gif" "$out9"
+found "ahref (4).gif" "$out9"
+grep -Fq 'src="asrc%20(3).gif"' "$saved9" ||
+    ! echo "FAIL #163: parens in a plain src attribute were wrongly encoded" || exit 1
+grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
+    ! echo "FAIL #163: parens in a plain href attribute were wrongly encoded" || exit 1
+! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
+    ! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
+
 exit 0
--- a/tests/01_engine-relative.test
+++ b/tests/01_engine-relative.test
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# lienrelatif (build relative path) + ident_url_relatif (resolve a link, collapse
+# ./ and ../). Regression net for #137/#162; expected values hand-computed.
+
+set -euo pipefail
+
+# relative path from <curr>'s directory to <link>
+rel() {
+	local got
+	got=$(httrack -O /dev/null -#l "$1" "$2")
+	test "$got" == "relative=$3" ||
+		{ echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"; exit 1; }
+}
+
+# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
+ident() {
+	local got
+	got=$(httrack -O /dev/null -#i "$1" "$2" "$3")
+	test "$got" == "$4" ||
+		{ echo "FAIL ident($1, $2, $3): got '$got' want '$4'"; exit 1; }
+}
+
+### lienrelatif
+
+rel 'dir/page.html'        'dir/index.html'        'page.html'
+rel 'dir/page.html'        'dir/page.html'         'page.html'   # self-link
+rel 'a.html'               'dir/index.html'        '../a.html'
+rel 'x.html'               'a/b/c/index.html'      '../../../x.html'
+rel 'h/a/x.jpg'            'h/a/sub/page.html'     '../x.jpg'
+rel 'a/b/c/x.html'         'index.html'            'a/b/c/x.html'
+rel 'h/sub/x.jpg'          'h/page.html'           'sub/x.jpg'
+rel 'h/dir2/x.jpg'         'h/dir1/page.html'      '../dir2/x.jpg'   # sibling dir
+rel 'h/bc/x.jpg'           'h/b/page.html'         '../bc/x.jpg'     # b/bc prefix trap
+rel 'h/b/x.jpg'            'h/bc/page.html'        '../b/x.jpg'
+rel 'h2/img/x.jpg'         'h1/p/page.html'        '../../h2/img/x.jpg'   # cross-host
+rel 'img.cdn/photo.jpg'    'www.site/articles/2020/post.html' '../../../img.cdn/photo.jpg'
+rel 'h/a/'                 'h/a/sub/page.html'     '../'             # link is ancestor dir
+rel 'x.html'               'page.html'             'x.html'
+rel 'dir/page.html?x=1'    'dir/index.html?y=2'    'page.html'       # ? stripped
+
+### ident_url_relatif
+
+ident 'img.gif'      'www.foo.com' '/dir/page.html'     'adr=www.foo.com fil=/dir/img.gif'
+ident 'sub/img.gif'  'www.foo.com' '/dir/page.html'     'adr=www.foo.com fil=/dir/sub/img.gif'
+ident '/img.gif'     'www.foo.com' '/dir/page.html'     'adr=www.foo.com fil=/img.gif'
+# embedded ../ collapses (#137)
+ident '../img.gif'             'www.foo.com' '/dir/sub/page.html'        'adr=www.foo.com fil=/dir/img.gif'
+ident 'sub/../logo.png'        'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/articles/2020/logo.png'
+ident '../../pix/sub/../logo.png' 'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/pix/logo.png'
+ident '../../../../x.gif'      'www.foo.com' '/a/b/page.html'           'adr=www.foo.com fil=/x.gif'   # above-root clamp
+ident '?page=2'      'www.foo.com' '/dir/index.html?old=1' 'adr=www.foo.com fil=/dir/index.html?page=2'
+ident 'http://other.com/a/b/../c/index.html' 'www.foo.com' '/p.html' 'adr=other.com fil=/a/c/index.html'
+# file:// collapses ../ like the other schemes; traversal contained, // authority kept
+ident 'file:///var/data/pix/sub/../logo.png' 'www.foo.com' '/p.html' 'adr=file:// fil=/var/data/pix/logo.png'
+ident 'file:///a/b/c/../../d/e.gif'          'www.foo.com' '/p.html' 'adr=file:// fil=/a/d/e.gif'
+ident 'file:///a/../../b'        'www.foo.com' '/p.html' 'adr=file:// fil=/b'
+ident 'file://srv/share/../x'    'www.foo.com' '/p.html' 'adr=file:// fil=//srv/x'
+ident 'mailto:foo@bar.com'   'www.foo.com' '/p.html'    'error=-1'   # unsupported scheme
+ident 'javascript:void(0)'   'www.foo.com' '/p.html'    'error=-1'
+
+echo "OK"
--- a/tests/01_engine-simplify.test
+++ b/tests/01_engine-simplify.test
@@ -26,3 +26,17 @@ simp './a/../../b' 'b'

 # empty segments ('//') are not dot-segments and are preserved, per RFC 3986
 simp 'a//b' 'a//b'
+simp 'a//b/../c' 'a//c'
+
+# absolute paths keep the leading '/'; above-root '..' is clamped to it
+simp '/a/../b' '/b'
+simp '/a/../../b' '/b'
+simp '/../x' '/x'
+
+# collapses to nothing -> './' (relative) or '/' (absolute)
+simp '..' './'
+simp 'a/..' './'
+simp '/' '/'
+
+simp 'a/b/..' 'a/'              # trailing bare '..'
+simp 'a/../b?x=../y' 'b?x=../y' # '?' freezes simplification
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -35,6 +35,7 @@ TESTS = \
 	01_engine-mime.test \
 	01_engine-parse.test \
 	01_engine-rcfile.test \
+	01_engine-relative.test \
 	01_engine-simplify.test \
 	01_engine-strsafe.test \
 	02_manpage-regen.test \
Author	SHA1	Message	Date
Xavier Roche	bdd1c1bc2c	Test the relative-link engine; collapse ../ in file:// URLs The ../-handling tickets #137 (embedded ../ in a URL) and #162 (cross-host "too many ../") do not reproduce on master or the released 3.49.x: the engine has resolved embedded, cross-host, out-of-scope and above-root ../ correctly since the 2012 import, and the released binary behaves identically. #137's actual breakage was a JS-generated iframe URL (httrack can't rewrite dynamically-built links); #162 is a long-gone Windows path quirk. The area was nearly untested, though, despite feeding both link rewriting and crawl-scope decisions: two trivial lienrelatif asserts, none for ident_url_relatif. Add a wide regression net via two hidden debug probes (-#l lienrelatif, -#i ident_url_relatif, mirroring -#1 fil_simplifie) driving tens of cases in tests/01_engine-relative.test (embedded/cross-host/sibling/ ancestor/above-root ../, query stripping, scheme handling), plus the missing fil_simplifie edge cases (absolute paths, root clamp, query freeze) in 01_engine-simplify.test. Expected values are computed by hand, not echoed. While covering it, fixed one real gap: the file:// branch of ident_url_absolute skipped the fil_simplifie its http sibling runs, so file:// URLs kept their ../ in adrfil->fil while the save path was already collapsed (htsname.c:1343). Collapsing it matches the other schemes, contains traversal at the file:// root, and dedups a/../b against b. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-20 11:14:28 +02:00
Xavier Roche	56665a268f	Merge pull request #400 from xroche/fix/css-url-paren-163 Encode parens in rewritten CSS url() so the value isn't truncated (#163)	2026-06-20 10:02:32 +02:00
Xavier Roche	2e948b9acd	htsparse: percent-encode parens in rewritten CSS url() (#163 ) A source url(...) whose target encodes '(' ')' as %28/%29 was rewritten with literal parens, because they are RFC2396 "mark" characters that the URI escaper (escape_uri_utf, mode 30) leaves alone. In an unquoted CSS url(...) the literal ')' closes the token early, so the browser mis-parses the value and drops the background image. Re-escape '(' and ')' back to %28/%29 when emitting the link, gated on the url() context (ending_p == ')'). The UA decodes them to the saved-on-disk name, so the reference still resolves. Quoted url("...") and ordinary HTML attributes keep their parens, matching prior behavior. Test in 01_engine-parse.test crawls a CSS fixture whose url() references a %20%28...%29 name and asserts the rewrite keeps the parens encoded; negative control confirmed (literal-paren output fails it). Closes #163 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-20 10:01:17 +02:00
Xavier Roche	cae11499f1	Merge pull request #399 from xroche/fix/js-string-falsepos-218 htsparse: don't treat XHR.open's method argument as a URL (#218)	2026-06-19 20:36:26 +02:00
Xavier Roche	02c7f4ebf6	htsparse: don't treat XHR.open's method argument as a URL (#218 ) The JavaScript URL detector matched `.open(` for window.open("url",...) and captured the first argument as a link. XMLHttpRequest.open(method, url) puts the HTTP method first, so `xhr.open("GET", "ajax_info.txt")` turned "GET" into a bogus link, rewritten to "GET.html" on a live server. Reject a first argument that is exactly an HTTP method, mirroring the existing ensure_not_mime guard. window.open(url) is unaffected; the real XHR url (the second argument) is still picked up by the dirty parser. Closes #218 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 20:27:04 +02:00
Xavier Roche	9070b44a70	Merge pull request #398 from xroche/fix/html-underflow-396 htsparse: fix buffer underflow reading *(html-1) at offset 0 (#396)	2026-06-19 19:55:40 +02:00
Xavier Roche	799c045061	htsparse: don't read *(html-1) before the parse buffer (#396 ) The link detector's word-boundary guards dereference *(html-1) to check the byte preceding a matched token. When the token sits at the very start of the parse buffer (html == r->adr), that reads one byte before the allocation: a heap-buffer-overflow under ASan, silent on a normal build. A stylesheet beginning with a url() token is enough to hit it. Route the three reachable guards (url(), location=, the makeindex /title check) through html_prevc(), which returns a space sentinel at the buffer start. Space is the right value for these tests: a token at offset 0 is at a word boundary, so it stays a valid match. The other *(html-1) sites only run after html has advanced past an opening tag or quote. Covers it with an offset-0 url() fixture in 01_engine-parse.test; without the fix it aborts at htsparse.c:1386 under the CI sanitizer job. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 19:44:25 +02:00
Xavier Roche	fb1ee3bf2e	Merge pull request #397 from xroche/fix/css-import-94 CSS @import: capture URLs that carry a media/supports/layer condition (#94)	2026-06-19 19:30:21 +02:00
Xavier Roche	6a08ca7d39	htsparse: bound the URL-end scan against a missing closing delimiter Reviewing the @import change, ASan flagged a pre-existing heap overflow: when a quoted/parenthesized link token has no closing delimiter before the buffer ends (truncated input such as `@import "x`, `@import "`, `url("x`), the scan stops at the terminating NUL, then `c += ndelim` steps past it and `while (*c == ' ')` / the terminator test read out of bounds. Such input aborts under ASan on master. Skip the URL-end scan and capture when no closing delimiter was found (`*c == '\0'` right after the scan); c never advances past the NUL. Well-formed tokens are unaffected. 01_engine-parse.test gains a truncated-@import fixture (the valid sibling import is still captured, the unterminated one is not) that trips the overflow under the CI ASan job, plus a check that an @import's trailing media/supports/layer condition survives the rewrite verbatim. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 19:25:39 +02:00
Xavier Roche	a8b491e509	htsparse: capture conditional CSS @import URLs (#94 ) A bare-string @import carrying a media/supports/layer condition, e.g. `@import "theme.css" screen;`, was dropped. The detector required the closing quote to be immediately followed by the statement terminator, so the trailing condition aborted the capture. The `url(...)` form already worked because it terminates at the paren. Two coupled defects in the inscript/CSS detector: - accept a whitespace-separated trailing condition after a quoted @import URL; - bound the captured URL at its last content char (b) instead of recomputing from the terminator. The old `c -= (ndelim + 1)` mishandled spaces skipped before the terminator, leaving the closing quote inside the range so the bogus-link guard aborted. That also silently broke `foo="url" ;` (a space before the semicolon) for every quoted detection, not only @import. 01_engine-parse.test gains a CSS @import section that crawls a .css directly; the conditioned cases are negative controls that fail without the fix. Closes #94 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 18:46:31 +02:00
Xavier Roche	a8e4bb3b81	Merge pull request #395 from xroche/fix/xmlns-false-links-191 Don't crawl xmlns namespace declarations	2026-06-19 18:28:23 +02:00
Xavier Roche	0145ec37a3	htsparse: don't crawl xmlns namespace declarations (#191 ) The "dirty parsing" heuristic accepts any tag attribute whose value looks like a URL unless the attribute is on the no-detect list. xmlns and xmlns:prefix declarations carry namespace URIs (xmlns:og="http://ogp.me/ns#", etc.) that are not resources, so httrack queued and fetched them, stalling the crawl on unrelated spec URLs. Reject xmlns/xmlns:prefix where the no-detect list is already consulted. 01_engine-parse.test grows a fixture with each form (default and prefixed) as the last attribute of its element, since the heuristic only inspects an attribute whose value is immediately followed by '>'; the targets are local file:// gifs so a regression actually downloads them (verified: reverting the guard fetches all three). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>	2026-06-19 18:24:55 +02:00
Xavier Roche	a80fab38ba	Merge pull request #394 from xroche/fix/proxy-https-connect-85 Tunnel https through the proxy via CONNECT (#85)	2026-06-19 18:03:31 +02:00