filtersize self-test: parse the size with sscanf(LLintP), and lock the '>' operator (#432)

Use the portable sscanf(argv[0], LLintP, &sz) idiom the rest of the tree uses to read an LLint, instead of strtoll: LLint is not always long long (MSVC __int64, plus fallbacks) and strtoll is absent on old MSVC. Add two cases so the size-rule scan-time neutrality is pinned for the '>' operator too, not only '<': -*.jpg*[>10] stays neutral at scan time and cancels once the size is known. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
Keep size-based filter rules neutral until the file size is known (#143) (#431)
2026-06-27 04:27:16 +03:00 · 2026-06-26 22:01:14 +02:00 · 2026-06-26 21:21:54 +02:00 · 2026-06-26 20:49:20 +02:00 · 2026-06-26 20:10:37 +02:00 · 2026-06-26 17:42:26 +02:00
10 changed files with 209 additions and 18 deletions
--- a/src/htsback.c
+++ b/src/htsback.c
@@ -3766,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
                    }
 #endif
 /********** **************************** ********** */
-                  } else {      // il faut aller le chercher
+                  }
+                  // MIME type excluded by a -mime: filter: abort, don't fetch
+                  // the body (#58)
+                  else if (HTTP_IS_OK(back[i].r.statuscode) &&
+                           !back[i].testmode &&
+                           strnotempty(back[i].r.contenttype) &&
+                           hts_acceptmime(opt, 0, back[i].url_adr,
+                                          back[i].url_fil,
+                                          back[i].r.contenttype) == 1) {
+                    deletehttp(&back[i].r);
+                    back[i].r.soc = INVALID_SOCKET;
+                    back[i].status = STATUS_READY;
+                    back_set_finished(sback, i);
+                    back[i].r.statuscode = STATUSCODE_EXCLUDED;
+                    strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
+                    hts_log_print(
+                        opt, LOG_NOTICE,
+                        "File excluded by MIME type filter (%s): %s%s",
+                        back[i].r.contenttype, back[i].url_adr,
+                        back[i].url_fil);
+                  } else { // il faut aller le chercher

                    // effacer buffer (requète)
                    if (!noFreebuff) {
@@ -3985,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,

                      }
                    }
-
                  }

                  /*} */
--- a/src/htsbasenet.h
+++ b/src/htsbasenet.h
@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
  STATUSCODE_NON_FATAL = -5,
  STATUSCODE_SSL_HANDSHAKE = -6,
  STATUSCODE_TOO_BIG = -7,
-  STATUSCODE_TEST_OK = -10
+  STATUSCODE_TEST_OK = -10,
+  STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
 } BackStatusCode;

 /** HTTrack status ('status' member of of 'lien_back') **/
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -736,26 +736,39 @@ int httpmirror(char *url1, httrackp * opt) {
    /* OPTIMIZED for fast load */
    if (StringNotEmpty(opt->filelist)) {
      char *filelist_buff = NULL;
-      const size_t filelist_sz = off_t_to_size_t(fsize(StringBuff(opt->filelist)));
+      size_t filelist_sz = 0;
+      const char *filelist_err = NULL; /* failure reason, NULL on success */
+      const off_t fs = fsize(StringBuff(opt->filelist));

-      if (filelist_sz != (size_t) -1) {
+      if (fs < 0) {
+        /* fsize() hides the cause; redo stat() for a precise errno (#49) */
+        struct stat st;
+        filelist_err = stat(StringBuff(opt->filelist), &st) != 0
+                           ? strerror(errno)
+                           : "not a regular file";
+      } else if ((filelist_sz = off_t_to_size_t(fs)) == (size_t) -1) {
+        filelist_err = "file too large";
+        filelist_sz = 0;
+      } else {
        FILE *fp = fopen(StringBuff(opt->filelist), "rb");

-        if (fp) {
+        if (fp == NULL) {
+          filelist_err = strerror(errno);
+        } else {
          filelist_buff = malloct(filelist_sz + 1);
-          if (filelist_buff) {
-            if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
-              freet(filelist_buff);
-              filelist_buff = NULL;
-            } else {
-              *(filelist_buff + filelist_sz) = '\0';
-            }
+          if (filelist_buff == NULL) {
+            filelist_err = "out of memory";
+          } else if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
+            freet(filelist_buff);
+            filelist_err = "read error";
+          } else {
+            filelist_buff[filelist_sz] = '\0';
          }
          fclose(fp);
        }
      }

-      if (filelist_buff) {
+      if (filelist_buff != NULL) {
        int filelist_ptr = 0;
        int n = 0;
        char BIGSTK line[HTS_URLMAXSIZE * 2];
@@ -780,8 +793,8 @@ int httpmirror(char *url1, httrackp * opt) {
        // Free buffer
        freet(filelist_buff);
      } else {
-        hts_log_print(opt, LOG_ERROR, "Could not include URL list: %s",
-                      StringBuff(opt->filelist));
+        hts_log_print(opt, LOG_ERROR, "Could not include URL list \"%s\": %s",
+                      StringBuff(opt->filelist), filelist_err);
      }
    }

--- a/src/htsfilters.c
+++ b/src/htsfilters.c
@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
    }
    if (size)
      sz = *size;
-    if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) {       // reconnu
+    /* size unknown (scan time): no size pointer => size tests stay neutral */
+    if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
      if (size)
        if (sz != *size)
          sizelimit = sz;
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -524,6 +524,32 @@ static int st_filter(httrackp *opt, int argc, char **argv) {
  return 0;
 }

+/* Size-aware filter verdict via fa_strjoker: a negative <size> means the size
+   is still unknown (scan time), so a size rule like -*.jpg*[<10] must stay
+   neutral. */
+static int st_filtersize(httrackp *opt, int argc, char **argv) {
+  LLint sz;
+  int size_flag = 0, verdict, known;
+
+  (void) opt;
+  if (argc < 3) {
+    fprintf(stderr, "filtersize: needs <size> <string> <filter> [filter...]\n");
+    return 1;
+  }
+  known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
+  sz = -1;
+  if (known)
+    sscanf(argv[0], LLintP, &sz);
+  verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
+                        known ? &size_flag : NULL, NULL);
+  printf("verdict=%s size_flag=%d\n",
+         verdict > 0   ? "allowed"
+         : verdict < 0 ? "forbidden"
+                       : "unknown",
+         size_flag);
+  return 0;
+}
+
 static int st_simplify(httrackp *opt, int argc, char **argv) {
  (void) opt;
  if (argc < 1) {
@@ -1038,6 +1064,9 @@ static const struct selftest_entry {
 } selftests[] = {
    {"filter", "<pattern> <string>", "match a string against a wildcard filter",
     st_filter},
+    {"filtersize", "<size> <string> <filter>...",
+     "size-aware filter verdict (negative size = unknown/scan time)",
+     st_filtersize},
    {"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
    {"mime", "<filename>", "MIME type for a filename", st_mime},
    {"charset", "<charset> <string>",
--- a/tests/01_engine-filelist.test
+++ b/tests/01_engine-filelist.test
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# -%L URL-list loading (#49): a readable list is honored; an unusable one fails
+# with the reason (errno / not-a-regular-file), not a bare "Could not include
+# URL list". Offline: file:// fixture, no server. Asserts on httrack's own
+# strings and the message shape, so it is locale-independent.
+
+set -euo pipefail
+
+tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_filelist.XXXXXX") || exit 1
+trap 'rm -rf "$tmp"' EXIT HUP INT QUIT PIPE TERM
+
+echo '<html><body>hi</body></html>' >"$tmp/index.html"
+
+# run httrack with the given -%L target; structured log lands in $out/hts-log.txt
+run() {
+    local out="$1" list="$2"
+    rm -rf "$out"
+    mkdir -p "$out"
+    httrack -O "$out" --quiet -n "-%L" "$list" >"$out/.stdout" 2>&1 || true
+    LOG="$out/hts-log.txt"
+}
+
+fail() {
+    echo "FAIL: $1"
+    cat "$LOG"
+    exit 1
+}
+loghas() {
+    grep -Eq "$1" "$LOG" || fail "expected /$1/ in $LOG"
+}
+lognot() {
+    if grep -Eq "$1" "$LOG"; then fail "unexpected /$1/ in $LOG"; fi
+}
+
+# readable list: its one URL is loaded and counted (count must be non-zero)
+printf 'file://%s/index.html\n' "$tmp" >"$tmp/urls.txt"
+run "$tmp/ok" "$tmp/urls.txt"
+loghas '[1-9][0-9]* links added from'
+
+# missing file: quoted name + a non-empty reason, never the old reasonless
+# "Could not include URL list: <name>". The reason is the stat() errno, not the
+# directory fallback literal (guards against dropping the errno lookup).
+run "$tmp/miss" "$tmp/nope.txt"
+loghas 'Could not include URL list "[^"]+": .+'
+lognot 'Could not include URL list: '
+lognot 'not a regular file'
+
+# a directory is rejected with our own reason (locale-independent)
+mkdir -p "$tmp/adir"
+run "$tmp/dir" "$tmp/adir"
+loghas 'Could not include URL list "[^"]+": not a regular file'
+
+# unreadable regular file: the fopen() errno arm fires, distinct from the
+# directory branch. Root bypasses mode 000, so skip it there.
+if test "$(id -u)" -ne 0; then
+    : >"$tmp/noperm.txt"
+    chmod 000 "$tmp/noperm.txt"
+    run "$tmp/perm" "$tmp/noperm.txt"
+    chmod 644 "$tmp/noperm.txt"
+    loghas 'Could not include URL list "[^"]+": .+'
+    lognot 'not a regular file'
+fi
+
+exit 0
--- a/tests/01_engine-filter.test
+++ b/tests/01_engine-filter.test
@@ -71,3 +71,27 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
 match '*[\[\]]' ']'   # only via the empty class-match + trailing ']'
 match '*[\[\]]' '[]'  # one of {'[','\'} then the trailing ']'
 nomatch '*[\[\]]' '[]x'
+
+# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
+# means the size is still unknown (scan time). A size exclusion must stay neutral
+# then, so the file is fetched and only cancelled once its size is known (#143).
+fsize() {
+    local want="$1"
+    shift
+    test "$(httrack -O /dev/null -#test=filtersize "$@")" == "$want" || exit 1
+}
+fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'   # scan time: keep
+fsize 'verdict=forbidden size_flag=1' 5 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'  # <10KB: cancel
+fsize 'verdict=allowed size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'   # >=10KB: keep
+fsize 'verdict=forbidden size_flag=0' -1 foo.txt -* '+*.jpg' '-*.jpg*[<10]' # not a jpg
+# the '>' operator is just as neutral at scan time, and fires once size is known
+fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[>10]'   # scan time: keep
+fsize 'verdict=forbidden size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
+
+# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
+# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
+nomatch '*[path]/end' 'a?b/end'
+nomatch '*[file]end' 'foo?xend'
+nomatch '*[name]X' 'abc?X'
+match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
+match '*.aspx' 'page.aspx?y=2'
--- a/tests/25_local-mime-exclude.test
+++ b/tests/25_local-mime-exclude.test
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# A -mime: exclusion must abort the transfer on the response Content-Type, not
+# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
+# the real one: the file is absent either way, but only the fix keeps the count
+# tiny (header only) instead of pulling the body. Match it positively (a small,
+# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
+
+: "${top_srcdir:=..}"
+
+bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
+    --found 'mimex/real.html' \
+    --not-found 'mimex/blob.pdf' \
+    --log-found 'excluded by MIME type filter' \
+    --log-found '\[[0-9]{1,4} bytes received' \
+    httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ TESTS = \
 	01_engine-dns.test \
 	01_engine-doitlog.test \
 	01_engine-entities.test \
+	01_engine-filelist.test \
 	01_engine-filter.test \
 	01_engine-hashtable.test \
 	01_engine-idna.test \
@@ -66,6 +67,7 @@ TESTS = \
 	21_local-intl-update.test \
 	22_local-broken-size.test \
 	23_local-errpage.test \
-	24_local-resume-overlap.test
+	24_local-resume-overlap.test \
+	25_local-mime-exclude.test

 CLEANFILES = check-network_sh.cache
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -177,6 +177,24 @@ class Handler(SimpleHTTPRequestHandler):
        body, ctype = self.TYPE_MATRIX[path]
        self.send_raw(body, ctype)

+    # --- MIME-type exclusion abort (issue #58) -----------------------------
+    # A -mime:application/pdf filter must abort the transfer once the header
+    # arrives, not download the whole body and discard it.
+    def route_mimex_index(self):
+        self.send_html(
+            '\t<a href="blob.pdf">pdf</a>\n' '\t<a href="real.html">real</a>\n'
+        )
+
+    # 1 MB body: the fix aborts after the header, so httrack's "bytes received"
+    # stays tiny; without it the engine reads the body and the count jumps.
+    MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
+
+    def route_mimex_blob(self):
+        self.send_raw(self.MIMEX_BLOB, "application/pdf")
+
+    def route_mimex_real(self):
+        self.send_raw(b"<html><body>real</body></html>", "text/html")
+
    # --- special chars in URLs across an update (issue #157) ---------------
    # A dotless, accented basename served as text/html (MediaWiki style). The
    # name the first crawl picks (.html) must survive the update pass.
@@ -355,6 +373,9 @@ class Handler(SimpleHTTPRequestHandler):
        "/errpage/good.html": route_errpage_good,
        "/errpage/missing.html": route_errpage_missing,
        "/errpage/empty.html": route_errpage_empty,
+        "/mimex/index.html": route_mimex_index,
+        "/mimex/blob.pdf": route_mimex_blob,
+        "/mimex/real.html": route_mimex_real,
    }

    # --- dispatch ----------------------------------------------------------
Author	SHA1	Message	Date
Xavier Roche	b138c87a93	filtersize self-test: parse the size with sscanf(LLintP), and lock the '>' operator (#432) Use the portable sscanf(argv[0], LLintP, &sz) idiom the rest of the tree uses to read an LLint, instead of strtoll: LLint is not always long long (MSVC __int64, plus fallbacks) and strtoll is absent on old MSVC. Add two cases so the size-rule scan-time neutrality is pinned for the '>' operator too, not only '<': -*.jpg*[>10] stays neutral at scan time and cancels once the size is known. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 22:01:14 +02:00
Xavier Roche	3de47433b7	Keep size-based filter rules neutral until the file size is known (#143) (#431) A rule such as -*.jpg*[<10] is meant to fetch every JPG, then delete the ones under 10KB once their size is known. Instead it could forbid all of them up front: at scan time the wizard calls fa_strjoker with no size, but fa_strjoker always handed strjoker the address of an uninitialized local sz, so the [<10] predicate ran against stack garbage. When that garbage fell in [0,10) the rule "matched" and the link was dropped before it was ever downloaded ("(wizard) explicit forbidden (-*.jpg*[<10])"). Pass no size pointer when the size is unknown, routing into strjoker's existing "test impossible -> no match" path so size rules stay neutral at scan time and only fire once the real size is in. The size-known path is unchanged. Add a filtersize engine self-test that drives fa_strjoker through both phases and a tests/01_engine-filter.test block locking the scenario. Also lock #144: the [name]/[file]/[path] classes do not span '?'; a trailing query is tolerated by the same global rule that lets *.aspx match page.aspx?y=2, not by the class. Working as intended. Closes #143 Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 21:21:54 +02:00
Xavier Roche	fb8827718e	htscore: report why a -%L URL list could not be loaded (#49 ) (#430 ) A missing, unreadable, or non-regular -%L file all collapsed into one reasonless "Could not include URL list: <name>", which is what left the #49 reporter unable to tell why the list was rejected. Open and stat() the file explicitly so the log carries the cause: the errno text (no such file, permission denied), "not a regular file", or "file too large". The loader keeps the original regular-file guard, so it still won't open a directory or FIFO. Covered by an offline file:// test: a readable list loads with a non-zero count, while a missing file, an unreadable file, and a directory each fail with a distinct reason instead of the bare message. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 20:49:20 +02:00
Xavier Roche	7228210061	Abort the download when the response MIME type is excluded by -mime: (#58 ) (#429 ) A -mime: exclusion only took effect after the full body had been downloaded and then discarded (leaving a .delayed temp behind), wasting bandwidth. Honor it as soon as the response Content-Type arrives: back_wait now aborts the transfer before the body when hts_acceptmime forbids the declared type, finishing the slot with a new STATUSCODE_EXCLUDED clean-skip status rather than fetching and dropping. Covers the reported case (an HTML-looking URL served as application/pdf past a +*.html include) and any -mime: match regardless of extension. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 20:10:37 +02:00
Xavier Roche	38882c0aee	Honor the server's Content-Range when resuming a partial download (#198 ) (#428 ) * Honor the server's Content-Range when resuming a partial download (#198) A resumed download (Range: bytes=N-) may be answered with a 206 whose range starts before N: block-aligned caches and CDNs routinely round the start down to a block boundary, and RFC 7233 lets the server pick the range it returns. httrack ignored the returned Content-Range and blindly appended the 206 body to the bytes already on disk, so the overlapping bytes were duplicated and the file grew by the overlap. With timing deciding which files get interrupted (and thus resumed), this surfaced as a random subset of files corrupted on each run, each a few bytes too large. Resume at the server's crange_start instead: ftruncate the partial to that offset and write the 206 body there (the in-memory branch keeps only that prefix). When the returned range is unusable (a forward gap, no/garbage Content-Range, or one that doesn't reach EOF) drop the partial and refetch the whole file rather than stitch a corrupt one. Reading the existing crange_start/crange_end/crange fields only, no ABI change. Driven by tests/24_local-resume-overlap.test: pass 1 interrupts a download mid-body, pass 2 resumes against a 206 that backs up 8 bytes, and the result must be byte-identical to the same content fetched whole. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com> * Harden #198 fix: verify the truncate, assert the test hit the resume path Two follow-ups from review of the resume fix. If HTS_FTRUNCATE fails the partial could keep a stale tail (only when the resource shrank between runs, sz > full, so the body write no longer covers the old end). Check its return and, on failure, drop the partial and refetch the whole file instead of writing a possibly-corrupt one. 
The resume test only compared the resumed bytes against the whole file, which also passes if httrack silently re-downloads the file with no Range (the bug never fires). Mark when the server actually serves a resume 206 and assert pass 2 hit that path, so a full re-download fails the test instead of passing it. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com> * tests: run 24_local-resume-overlap under set -e Follow the golden rule for shell scripts: start with set -e so a non-last failure can't be masked. Guard the backgrounded-crawl kill/wait spots with || true so the expected SIGTERM exit doesn't abort the run. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com> --------- Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 17:42:26 +02:00