Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
0edf5e3cde Keep size-based filter rules neutral until the file size is known (#143)
A rule such as -*.jpg*[<10] is meant to fetch every JPG, then delete the
ones under 10KB once their size is known. Instead it could forbid all of
them up front: at scan time the wizard calls fa_strjoker with no size, but
fa_strjoker always handed strjoker the address of an uninitialized local sz,
so the *[<10] predicate ran against stack garbage. When that garbage fell in
[0,10) the rule "matched" and the link was dropped before it was ever
downloaded ("(wizard) explicit forbidden (-*.jpg*[<10])").

Pass no size pointer when the size is unknown, routing into strjoker's
existing "test impossible -> no match" path so size rules stay neutral at
scan time and only fire once the real size is in. The size-known path is
unchanged.

Add a filtersize engine self-test that drives fa_strjoker through both
phases and a tests/01_engine-filter.test block locking the scenario.

Also lock #144: the *[name]/*[file]/*[path] classes do not span '?'; a
trailing query is tolerated by the same global rule that lets *.aspx match
page.aspx?y=2, not by the class. Working as intended.

Closes #143

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 21:11:01 +02:00
3 changed files with 50 additions and 1 deletion

View File

@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
}
if (size)
sz = *size;
if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) { // reconnu
/* size unknown (scan time): no size pointer => size tests stay neutral */
if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
if (size)
if (sz != *size)
sizelimit = sz;

View File

@@ -524,6 +524,30 @@ static int st_filter(httrackp *opt, int argc, char **argv) {
return 0;
}
/* Size-aware filter verdict via fa_strjoker.
   Arguments: argv[0] = <size>, argv[1] = <string>, argv[2..] = filter rules.
   A <size> starting with '-' ("-1", "-") means the size is still unknown
   (scan time): no size pointer is handed to fa_strjoker, so a size rule like
   -*.jpg*[<10] must stay neutral. Otherwise <size> must be a plain decimal
   integer and is passed as the known size.
   Prints "verdict=allowed|forbidden|unknown size_flag=N" on stdout.
   Returns 0 on success, 1 on a usage error (too few arguments, bad <size>). */
static int st_filtersize(httrackp *opt, int argc, char **argv) {
  LLint sz = -1;
  int size_flag = 0, verdict, known;

  (void) opt;
  if (argc < 3) {
    fprintf(stderr, "filtersize: needs <size> <string> <filter> [filter...]\n");
    return 1;
  }
  known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
  if (known) {
    char *end = NULL;

    sz = (LLint) strtoll(argv[0], &end, 10);
    /* Reject "", "abc", "10KB": without this check strtoll would silently
       yield 0 (or a numeric prefix) and the test would run against a size
       the caller never asked for. */
    if (end == argv[0] || *end != '\0') {
      fprintf(stderr, "filtersize: invalid <size> '%s'\n", argv[0]);
      return 1;
    }
  }
  /* Unknown size: pass neither the size nor the size_flag pointer. */
  verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
                        known ? &size_flag : NULL, NULL);
  printf("verdict=%s size_flag=%d\n",
         verdict > 0   ? "allowed"
         : verdict < 0 ? "forbidden"
                       : "unknown",
         size_flag);
  return 0;
}
static int st_simplify(httrackp *opt, int argc, char **argv) {
(void) opt;
if (argc < 1) {
@@ -1038,6 +1062,9 @@ static const struct selftest_entry {
} selftests[] = {
{"filter", "<pattern> <string>", "match a string against a wildcard filter",
st_filter},
{"filtersize", "<size> <string> <filter>...",
"size-aware filter verdict (negative size = unknown/scan time)",
st_filtersize},
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",

View File

@@ -71,3 +71,24 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
nomatch '*[\[\]]' '[]x'
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
# means the size is still unknown (scan time). A size exclusion must stay neutral
# then, so the file is fetched and only cancelled once its size is known (#143).
#
# fsize <expected-output> <size> <string> <filter...>
# Runs the filtersize self-test and aborts the script (exit 1) unless its output
# is exactly <expected-output>.
fsize() {
    local want="$1"
    shift
    # '=' rather than '==': the latter is not a POSIX test operator.
    if test "$(httrack -O /dev/null -#test=filtersize "$@")" = "$want"; then
        return 0
    fi
    # Say which case failed before aborting, instead of exiting silently.
    echo "fsize: expected '$want' for: $*" >&2
    exit 1
}
# The '-*' rule is quoted like the others so the shell never glob-expands it
# against files in the current directory whose names start with '-'.
fsize 'verdict=allowed size_flag=0' -1 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # scan time: keep
fsize 'verdict=forbidden size_flag=1' 5 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # <10KB: cancel
fsize 'verdict=allowed size_flag=1' 20 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # >=10KB: keep
fsize 'verdict=forbidden size_flag=0' -1 foo.txt '-*' '+*.jpg' '-*.jpg*[<10]' # not a jpg
# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
# The three nomatch cases put '?' inside the part the class would have to cover;
# the two match cases put the query at the very end of the string.
nomatch '*[path]/end' 'a?b/end'
nomatch '*[file]end' 'foo?xend'
nomatch '*[name]X' 'abc?X'
match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
# Reference case: a plain wildcard pattern followed by a trailing query.
match '*.aspx' 'page.aspx?y=2'