filtersize self-test: parse the size with sscanf(LLintP), and lock the '>' operator (#432)

Use the portable sscanf(argv[0], LLintP, &sz) idiom the rest of the tree uses to read an LLint, instead of strtoll: LLint is not always long long (MSVC __int64, plus fallbacks) and strtoll is absent on old MSVC.

Add two cases so the size-rule scan-time neutrality is pinned for the '>' operator too, not only '<': -*.jpg*[>10] stays neutral at scan time and cancels once the size is known.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
Keep size-based filter rules neutral until the file size is known (#143) (#431)
2026-06-27 04:27:16 +03:00 · 2026-06-26 22:01:14 +02:00 · 2026-06-26 21:21:54 +02:00 · 2026-06-26 20:49:20 +02:00 · 2026-06-26 20:10:37 +02:00
10 changed files with 209 additions and 18 deletions
--- a/src/htsback.c
+++ b/src/htsback.c
@@ -3766,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
                    }
 #endif
 /********** **************************** ********** */
-                  } else {      // il faut aller le chercher
+                  }
+                  // MIME type excluded by a -mime: filter: abort, don't fetch
+                  // the body (#58)
+                  else if (HTTP_IS_OK(back[i].r.statuscode) &&
+                           !back[i].testmode &&
+                           strnotempty(back[i].r.contenttype) &&
+                           hts_acceptmime(opt, 0, back[i].url_adr,
+                                          back[i].url_fil,
+                                          back[i].r.contenttype) == 1) {
+                    deletehttp(&back[i].r);
+                    back[i].r.soc = INVALID_SOCKET;
+                    back[i].status = STATUS_READY;
+                    back_set_finished(sback, i);
+                    back[i].r.statuscode = STATUSCODE_EXCLUDED;
+                    strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
+                    hts_log_print(
+                        opt, LOG_NOTICE,
+                        "File excluded by MIME type filter (%s): %s%s",
+                        back[i].r.contenttype, back[i].url_adr,
+                        back[i].url_fil);
+                  } else { // il faut aller le chercher

                    // effacer buffer (requète)
                    if (!noFreebuff) {
@@ -3985,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,

                      }
                    }
-
                  }

                  /*} */
--- a/src/htsbasenet.h
+++ b/src/htsbasenet.h
@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
  STATUSCODE_NON_FATAL = -5,
  STATUSCODE_SSL_HANDSHAKE = -6,
  STATUSCODE_TOO_BIG = -7,
-  STATUSCODE_TEST_OK = -10
+  STATUSCODE_TEST_OK = -10,
+  STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
 } BackStatusCode;

 /** HTTrack status ('status' member of of 'lien_back') **/
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -736,26 +736,39 @@ int httpmirror(char *url1, httrackp * opt) {
    /* OPTIMIZED for fast load */
    if (StringNotEmpty(opt->filelist)) {
      char *filelist_buff = NULL;
-      const size_t filelist_sz = off_t_to_size_t(fsize(StringBuff(opt->filelist)));
+      size_t filelist_sz = 0;
+      const char *filelist_err = NULL; /* failure reason, NULL on success */
+      const off_t fs = fsize(StringBuff(opt->filelist));

-      if (filelist_sz != (size_t) -1) {
+      if (fs < 0) {
+        /* fsize() hides the cause; redo stat() for a precise errno (#49) */
+        struct stat st;
+        filelist_err = stat(StringBuff(opt->filelist), &st) != 0
+                           ? strerror(errno)
+                           : "not a regular file";
+      } else if ((filelist_sz = off_t_to_size_t(fs)) == (size_t) -1) {
+        filelist_err = "file too large";
+        filelist_sz = 0;
+      } else {
        FILE *fp = fopen(StringBuff(opt->filelist), "rb");

-        if (fp) {
+        if (fp == NULL) {
+          filelist_err = strerror(errno);
+        } else {
          filelist_buff = malloct(filelist_sz + 1);
-          if (filelist_buff) {
-            if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
-              freet(filelist_buff);
-              filelist_buff = NULL;
-            } else {
-              *(filelist_buff + filelist_sz) = '\0';
-            }
+          if (filelist_buff == NULL) {
+            filelist_err = "out of memory";
+          } else if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
+            freet(filelist_buff);
+            filelist_err = "read error";
+          } else {
+            filelist_buff[filelist_sz] = '\0';
          }
          fclose(fp);
        }
      }

-      if (filelist_buff) {
+      if (filelist_buff != NULL) {
        int filelist_ptr = 0;
        int n = 0;
        char BIGSTK line[HTS_URLMAXSIZE * 2];
@@ -780,8 +793,8 @@ int httpmirror(char *url1, httrackp * opt) {
        // Free buffer
        freet(filelist_buff);
      } else {
-        hts_log_print(opt, LOG_ERROR, "Could not include URL list: %s",
-                      StringBuff(opt->filelist));
+        hts_log_print(opt, LOG_ERROR, "Could not include URL list \"%s\": %s",
+                      StringBuff(opt->filelist), filelist_err);
      }
    }

--- a/src/htsfilters.c
+++ b/src/htsfilters.c
@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
    }
    if (size)
      sz = *size;
-    if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) {       // reconnu
+    /* size unknown (scan time): no size pointer => size tests stay neutral */
+    if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
      if (size)
        if (sz != *size)
          sizelimit = sz;
--- a/src/htsselftest.c
+++ b/src/htsselftest.c
@@ -524,6 +524,32 @@ static int st_filter(httrackp *opt, int argc, char **argv) {
  return 0;
 }

+/* Size-aware filter verdict via fa_strjoker: a negative <size> means the size
+   is still unknown (scan time), so a size rule like -*.jpg*[<10] must stay
+   neutral. */
+static int st_filtersize(httrackp *opt, int argc, char **argv) {
+  LLint sz;
+  int size_flag = 0, verdict, known;
+
+  (void) opt;
+  if (argc < 3) {
+    fprintf(stderr, "filtersize: needs <size> <string> <filter> [filter...]\n");
+    return 1;
+  }
+  known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
+  sz = -1;
+  if (known)
+    sscanf(argv[0], LLintP, &sz);
+  verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
+                        known ? &size_flag : NULL, NULL);
+  printf("verdict=%s size_flag=%d\n",
+         verdict > 0   ? "allowed"
+         : verdict < 0 ? "forbidden"
+                       : "unknown",
+         size_flag);
+  return 0;
+}
+
 static int st_simplify(httrackp *opt, int argc, char **argv) {
  (void) opt;
  if (argc < 1) {
@@ -1038,6 +1064,9 @@ static const struct selftest_entry {
 } selftests[] = {
    {"filter", "<pattern> <string>", "match a string against a wildcard filter",
     st_filter},
+    {"filtersize", "<size> <string> <filter>...",
+     "size-aware filter verdict (negative size = unknown/scan time)",
+     st_filtersize},
    {"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
    {"mime", "<filename>", "MIME type for a filename", st_mime},
    {"charset", "<charset> <string>",
--- a/tests/01_engine-filelist.test
+++ b/tests/01_engine-filelist.test
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# -%L URL-list loading (#49): a readable list is honored; an unusable one fails
+# with the reason (errno / not-a-regular-file), not a bare "Could not include
+# URL list". Offline: file:// fixture, no server. Asserts on httrack's own
+# strings and the message shape, so it is locale-independent.
+
+set -euo pipefail
+
+tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_filelist.XXXXXX") || exit 1
+trap 'rm -rf "$tmp"' EXIT HUP INT QUIT PIPE TERM
+
+echo '<html><body>hi</body></html>' >"$tmp/index.html"
+
+# run httrack with the given -%L target; structured log lands in $out/hts-log.txt
+run() {
+    local out="$1" list="$2"
+    rm -rf "$out"
+    mkdir -p "$out"
+    httrack -O "$out" --quiet -n "-%L" "$list" >"$out/.stdout" 2>&1 || true
+    LOG="$out/hts-log.txt"
+}
+
+fail() {
+    echo "FAIL: $1"
+    cat "$LOG"
+    exit 1
+}
+loghas() {
+    grep -Eq "$1" "$LOG" || fail "expected /$1/ in $LOG"
+}
+lognot() {
+    if grep -Eq "$1" "$LOG"; then fail "unexpected /$1/ in $LOG"; fi
+}
+
+# readable list: its one URL is loaded and counted (count must be non-zero)
+printf 'file://%s/index.html\n' "$tmp" >"$tmp/urls.txt"
+run "$tmp/ok" "$tmp/urls.txt"
+loghas '[1-9][0-9]* links added from'
+
+# missing file: quoted name + a non-empty reason, never the old reasonless
+# "Could not include URL list: <name>". The reason is the stat() errno, not the
+# directory fallback literal (guards against dropping the errno lookup).
+run "$tmp/miss" "$tmp/nope.txt"
+loghas 'Could not include URL list "[^"]+": .+'
+lognot 'Could not include URL list: '
+lognot 'not a regular file'
+
+# a directory is rejected with our own reason (locale-independent)
+mkdir -p "$tmp/adir"
+run "$tmp/dir" "$tmp/adir"
+loghas 'Could not include URL list "[^"]+": not a regular file'
+
+# unreadable regular file: the fopen() errno arm fires, distinct from the
+# directory branch. Root bypasses mode 000, so skip it there.
+if test "$(id -u)" -ne 0; then
+    : >"$tmp/noperm.txt"
+    chmod 000 "$tmp/noperm.txt"
+    run "$tmp/perm" "$tmp/noperm.txt"
+    chmod 644 "$tmp/noperm.txt"
+    loghas 'Could not include URL list "[^"]+": .+'
+    lognot 'not a regular file'
+fi
+
+exit 0
--- a/tests/01_engine-filter.test
+++ b/tests/01_engine-filter.test
@@ -71,3 +71,27 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
 match '*[\[\]]' ']'   # only via the empty class-match + trailing ']'
 match '*[\[\]]' '[]'  # one of {'[','\'} then the trailing ']'
 nomatch '*[\[\]]' '[]x'
+
+# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
+# means the size is still unknown (scan time). A size exclusion must stay neutral
+# then, so the file is fetched and only cancelled once its size is known (#143).
+fsize() {
+    local want="$1"
+    shift
+    test "$(httrack -O /dev/null -#test=filtersize "$@")" == "$want" || exit 1
+}
+fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'   # scan time: keep
+fsize 'verdict=forbidden size_flag=1' 5 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'  # <10KB: cancel
+fsize 'verdict=allowed size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[<10]'   # >=10KB: keep
+fsize 'verdict=forbidden size_flag=0' -1 foo.txt -* '+*.jpg' '-*.jpg*[<10]' # not a jpg
+# the '>' operator is just as neutral at scan time, and fires once size is known
+fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[>10]'   # scan time: keep
+fsize 'verdict=forbidden size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
+
+# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
+# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
+nomatch '*[path]/end' 'a?b/end'
+nomatch '*[file]end' 'foo?xend'
+nomatch '*[name]X' 'abc?X'
+match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
+match '*.aspx' 'page.aspx?y=2'
--- a/tests/25_local-mime-exclude.test
+++ b/tests/25_local-mime-exclude.test
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# A -mime: exclusion must abort the transfer on the response Content-Type, not
+# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
+# the real one: the file is absent either way, but only the fix keeps the count
+# tiny (header only) instead of pulling the body. Match it positively (a small,
+# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
+
+: "${top_srcdir:=..}"
+
+bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
+    --found 'mimex/real.html' \
+    --not-found 'mimex/blob.pdf' \
+    --log-found 'excluded by MIME type filter' \
+    --log-found '\[[0-9]{1,4} bytes received' \
+    httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ TESTS = \
 	01_engine-dns.test \
 	01_engine-doitlog.test \
 	01_engine-entities.test \
+	01_engine-filelist.test \
 	01_engine-filter.test \
 	01_engine-hashtable.test \
 	01_engine-idna.test \
@@ -66,6 +67,7 @@ TESTS = \
 	21_local-intl-update.test \
 	22_local-broken-size.test \
 	23_local-errpage.test \
-	24_local-resume-overlap.test
+	24_local-resume-overlap.test \
+	25_local-mime-exclude.test

 CLEANFILES = check-network_sh.cache
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -177,6 +177,24 @@ class Handler(SimpleHTTPRequestHandler):
        body, ctype = self.TYPE_MATRIX[path]
        self.send_raw(body, ctype)

+    # --- MIME-type exclusion abort (issue #58) -----------------------------
+    # A -mime:application/pdf filter must abort the transfer once the header
+    # arrives, not download the whole body and discard it.
+    def route_mimex_index(self):
+        self.send_html(
+            '\t<a href="blob.pdf">pdf</a>\n' '\t<a href="real.html">real</a>\n'
+        )
+
+    # 1 MB body: the fix aborts after the header, so httrack's "bytes received"
+    # stays tiny; without it the engine reads the body and the count jumps.
+    MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
+
+    def route_mimex_blob(self):
+        self.send_raw(self.MIMEX_BLOB, "application/pdf")
+
+    def route_mimex_real(self):
+        self.send_raw(b"<html><body>real</body></html>", "text/html")
+
    # --- special chars in URLs across an update (issue #157) ---------------
    # A dotless, accented basename served as text/html (MediaWiki style). The
    # name the first crawl picks (.html) must survive the update pass.
@@ -355,6 +373,9 @@ class Handler(SimpleHTTPRequestHandler):
        "/errpage/good.html": route_errpage_good,
        "/errpage/missing.html": route_errpage_missing,
        "/errpage/empty.html": route_errpage_empty,
+        "/mimex/index.html": route_mimex_index,
+        "/mimex/blob.pdf": route_mimex_blob,
+        "/mimex/real.html": route_mimex_real,
    }

    # --- dispatch ----------------------------------------------------------
Author	SHA1	Message	Date
Xavier Roche	b138c87a93	filtersize self-test: parse the size with sscanf(LLintP), and lock the '>' operator (#432) Use the portable sscanf(argv[0], LLintP, &sz) idiom the rest of the tree uses to read an LLint, instead of strtoll: LLint is not always long long (MSVC __int64, plus fallbacks) and strtoll is absent on old MSVC. Add two cases so the size-rule scan-time neutrality is pinned for the '>' operator too, not only '<': -*.jpg*[>10] stays neutral at scan time and cancels once the size is known. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 22:01:14 +02:00
Xavier Roche	3de47433b7	Keep size-based filter rules neutral until the file size is known (#143) (#431) A rule such as -*.jpg*[<10] is meant to fetch every JPG, then delete the ones under 10KB once their size is known. Instead it could forbid all of them up front: at scan time the wizard calls fa_strjoker with no size, but fa_strjoker always handed strjoker the address of an uninitialized local sz, so the [<10] predicate ran against stack garbage. When that garbage fell in [0,10) the rule "matched" and the link was dropped before it was ever downloaded ("(wizard) explicit forbidden (-*.jpg*[<10])"). Pass no size pointer when the size is unknown, routing into strjoker's existing "test impossible -> no match" path so size rules stay neutral at scan time and only fire once the real size is in. The size-known path is unchanged. Add a filtersize engine self-test that drives fa_strjoker through both phases and a tests/01_engine-filter.test block locking the scenario. Also lock #144: the [name]/[file]/[path] classes do not span '?'; a trailing query is tolerated by the same global rule that lets *.aspx match page.aspx?y=2, not by the class. Working as intended. Closes #143 Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 21:21:54 +02:00
Xavier Roche	fb8827718e	htscore: report why a -%L URL list could not be loaded (#49) (#430) A missing, unreadable, or non-regular -%L file all collapsed into one reasonless "Could not include URL list: <name>", which is what left the #49 reporter unable to tell why the list was rejected. Open and stat() the file explicitly so the log carries the cause: the errno text (no such file, permission denied), "not a regular file", or "file too large". The loader keeps the original regular-file guard, so it still won't open a directory or FIFO. Covered by an offline file:// test: a readable list loads with a non-zero count, while a missing file, an unreadable file, and a directory each fail with a distinct reason instead of the bare message. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 20:49:20 +02:00
Xavier Roche	7228210061	Abort the download when the response MIME type is excluded by -mime: (#58) (#429) A -mime: exclusion only took effect after the full body had been downloaded and then discarded (leaving a .delayed temp behind), wasting bandwidth. Honor it as soon as the response Content-Type arrives: back_wait now aborts the transfer before the body when hts_acceptmime forbids the declared type, finishing the slot with a new STATUSCODE_EXCLUDED clean-skip status rather than fetching and dropping. Covers the reported case (an HTML-looking URL served as application/pdf past a +*.html include) and any -mime: match regardless of extension. Signed-off-by: Xavier Roche <roche@httrack.com> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-26 20:10:37 +02:00