Compare commits

...

4 Commits

Author SHA1 Message Date
Xavier Roche
b138c87a93 filtersize self-test: parse the size with sscanf(LLintP), and lock the '>' operator (#432)
Use the portable sscanf(argv[0], LLintP, &sz) idiom the rest of the tree
uses to read an LLint, instead of strtoll: LLint is not always long long
(MSVC __int64, plus fallbacks) and strtoll is absent on old MSVC.

Add two cases so the size-rule scan-time neutrality is pinned for the '>'
operator too, not only '<': -*.jpg*[>10] stays neutral at scan time and
cancels once the size is known.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 22:01:14 +02:00
Xavier Roche
3de47433b7 Keep size-based filter rules neutral until the file size is known (#143) (#431)
A rule such as -*.jpg*[<10] is meant to fetch every JPG, then delete the
ones under 10KB once their size is known. Instead it could forbid all of
them up front: at scan time the wizard calls fa_strjoker with no size, but
fa_strjoker always handed strjoker the address of an uninitialized local sz,
so the *[<10] predicate ran against stack garbage. When that garbage fell in
[0,10) the rule "matched" and the link was dropped before it was ever
downloaded ("(wizard) explicit forbidden (-*.jpg*[<10])").

Pass no size pointer when the size is unknown, routing into strjoker's
existing "test impossible -> no match" path so size rules stay neutral at
scan time and only fire once the real size is in. The size-known path is
unchanged.

Add a filtersize engine self-test that drives fa_strjoker through both
phases and a tests/01_engine-filter.test block locking the scenario.

Also lock #144: the *[name]/*[file]/*[path] classes do not span '?'; a
trailing query is tolerated by the same global rule that lets *.aspx match
page.aspx?y=2, not by the class. Working as intended.

Closes #143

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 21:21:54 +02:00
Xavier Roche
fb8827718e htscore: report why a -%L URL list could not be loaded (#49) (#430)
A missing, unreadable, or non-regular -%L file all collapsed into one
reasonless "Could not include URL list: <name>", which is what left the
#49 reporter unable to tell why the list was rejected. Open and stat()
the file explicitly so the log carries the cause: the errno text (no
such file, permission denied), "not a regular file", or "file too
large". The loader keeps the original regular-file guard, so it still
won't open a directory or FIFO.

Covered by an offline file:// test: a readable list loads with a
non-zero count, while a missing file, an unreadable file, and a
directory each fail with a distinct reason instead of the bare message.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 20:49:20 +02:00
Xavier Roche
7228210061 Abort the download when the response MIME type is excluded by -mime: (#58) (#429)
A -mime: exclusion only took effect after the full body had been
downloaded and then discarded (leaving a .delayed temp behind), wasting
bandwidth. Honor it as soon as the response Content-Type arrives:
back_wait now aborts the transfer before the body when hts_acceptmime
forbids the declared type, finishing the slot with a new
STATUSCODE_EXCLUDED clean-skip status rather than fetching and dropping.

Covers the reported case (an HTML-looking URL served as application/pdf
past a +*.html include) and any -mime: match regardless of extension.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 20:10:37 +02:00
10 changed files with 209 additions and 18 deletions

View File

@@ -3766,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
}
#endif
/********** **************************** ********** */
} else { // il faut aller le chercher
}
// MIME type excluded by a -mime: filter: abort, don't fetch
// the body (#58)
else if (HTTP_IS_OK(back[i].r.statuscode) &&
!back[i].testmode &&
strnotempty(back[i].r.contenttype) &&
hts_acceptmime(opt, 0, back[i].url_adr,
back[i].url_fil,
back[i].r.contenttype) == 1) {
deletehttp(&back[i].r);
back[i].r.soc = INVALID_SOCKET;
back[i].status = STATUS_READY;
back_set_finished(sback, i);
back[i].r.statuscode = STATUSCODE_EXCLUDED;
strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
hts_log_print(
opt, LOG_NOTICE,
"File excluded by MIME type filter (%s): %s%s",
back[i].r.contenttype, back[i].url_adr,
back[i].url_fil);
} else { // il faut aller le chercher
// effacer buffer (requète)
if (!noFreebuff) {
@@ -3985,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
}
}
}
/*} */

View File

@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
STATUSCODE_NON_FATAL = -5,
STATUSCODE_SSL_HANDSHAKE = -6,
STATUSCODE_TOO_BIG = -7,
STATUSCODE_TEST_OK = -10
STATUSCODE_TEST_OK = -10,
STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
} BackStatusCode;
/** HTTrack status ('status' member of 'lien_back') **/

View File

@@ -736,26 +736,39 @@ int httpmirror(char *url1, httrackp * opt) {
/* OPTIMIZED for fast load */
if (StringNotEmpty(opt->filelist)) {
char *filelist_buff = NULL;
const size_t filelist_sz = off_t_to_size_t(fsize(StringBuff(opt->filelist)));
size_t filelist_sz = 0;
const char *filelist_err = NULL; /* failure reason, NULL on success */
const off_t fs = fsize(StringBuff(opt->filelist));
if (filelist_sz != (size_t) -1) {
if (fs < 0) {
/* fsize() hides the cause; redo stat() for a precise errno (#49) */
struct stat st;
filelist_err = stat(StringBuff(opt->filelist), &st) != 0
? strerror(errno)
: "not a regular file";
} else if ((filelist_sz = off_t_to_size_t(fs)) == (size_t) -1) {
filelist_err = "file too large";
filelist_sz = 0;
} else {
FILE *fp = fopen(StringBuff(opt->filelist), "rb");
if (fp) {
if (fp == NULL) {
filelist_err = strerror(errno);
} else {
filelist_buff = malloct(filelist_sz + 1);
if (filelist_buff) {
if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
freet(filelist_buff);
filelist_buff = NULL;
} else {
*(filelist_buff + filelist_sz) = '\0';
}
if (filelist_buff == NULL) {
filelist_err = "out of memory";
} else if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
freet(filelist_buff);
filelist_err = "read error";
} else {
filelist_buff[filelist_sz] = '\0';
}
fclose(fp);
}
}
if (filelist_buff) {
if (filelist_buff != NULL) {
int filelist_ptr = 0;
int n = 0;
char BIGSTK line[HTS_URLMAXSIZE * 2];
@@ -780,8 +793,8 @@ int httpmirror(char *url1, httrackp * opt) {
// Free buffer
freet(filelist_buff);
} else {
hts_log_print(opt, LOG_ERROR, "Could not include URL list: %s",
StringBuff(opt->filelist));
hts_log_print(opt, LOG_ERROR, "Could not include URL list \"%s\": %s",
StringBuff(opt->filelist), filelist_err);
}
}

View File

@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
}
if (size)
sz = *size;
if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) { // reconnu
/* size unknown (scan time): no size pointer => size tests stay neutral */
if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
if (size)
if (sz != *size)
sizelimit = sz;

View File

@@ -524,6 +524,32 @@ static int st_filter(httrackp *opt, int argc, char **argv) {
return 0;
}
/* Size-aware filter verdict via fa_strjoker.
   argv[0] is the size, argv[1] the string to test, argv[2..] the filter rules.
   A <size> starting with '-' means the size is still unknown (scan time), so a
   size rule like -*.jpg*[<10] must stay neutral: no size and no size_flag
   pointer are handed to fa_strjoker in that case.
   Prints "verdict=<allowed|forbidden|unknown> size_flag=<n>".
   Returns 0 once a verdict was printed, 1 on a usage error (too few arguments
   or a <size> that does not parse as a number). */
static int st_filtersize(httrackp *opt, int argc, char **argv) {
  LLint sz = -1;
  int size_flag = 0, verdict, known;

  (void) opt;
  if (argc < 3) {
    fprintf(stderr, "filtersize: needs <size> <string> <filter> [filter...]\n");
    return 1;
  }
  known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
  /* A "known" size must actually parse: otherwise sz would silently stay at -1
     and be passed on as if it were a real size. */
  if (known && sscanf(argv[0], LLintP, &sz) != 1) {
    fprintf(stderr, "filtersize: invalid size '%s'\n", argv[0]);
    return 1;
  }
  verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
                        known ? &size_flag : NULL, NULL);
  printf("verdict=%s size_flag=%d\n",
         verdict > 0   ? "allowed"
         : verdict < 0 ? "forbidden"
                       : "unknown",
         size_flag);
  return 0;
}
static int st_simplify(httrackp *opt, int argc, char **argv) {
(void) opt;
if (argc < 1) {
@@ -1038,6 +1064,9 @@ static const struct selftest_entry {
} selftests[] = {
{"filter", "<pattern> <string>", "match a string against a wildcard filter",
st_filter},
{"filtersize", "<size> <string> <filter>...",
"size-aware filter verdict (negative size = unknown/scan time)",
st_filtersize},
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",

View File

@@ -0,0 +1,65 @@
#!/bin/bash
#
# -%L URL-list loading (#49): a readable list is honored; an unusable one fails
# with the reason (errno / not-a-regular-file), not a bare "Could not include
# URL list". Offline: file:// fixture, no server. Asserts on httrack's own
# strings and the message shape, so it is locale-independent.
set -euo pipefail
tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_filelist.XXXXXX") || exit 1
trap 'rm -rf "$tmp"' EXIT HUP INT QUIT PIPE TERM
echo '<html><body>hi</body></html>' >"$tmp/index.html"
# Crawl with "$2" as the -%L URL list into a freshly recreated output directory
# "$1". httrack's stdout/stderr are captured in "$1/.stdout" and its exit status
# is deliberately ignored. Sets the global LOG to "$1/hts-log.txt".
run() {
  local outdir="$1"
  local urllist="$2"
  rm -rf "$outdir"
  mkdir -p "$outdir"
  if ! httrack -O "$outdir" --quiet -n "-%L" "$urllist" >"$outdir/.stdout" 2>&1; then
    : # exit status ignored on purpose; callers assert on the log instead
  fi
  LOG="$outdir/hts-log.txt"
}
# Report a failed expectation ("$1"), dump the current log ($LOG) for
# diagnosis, and abort the test with status 1.
fail() {
  local reason="$1"
  printf 'FAIL: %s\n' "$reason"
  cat "$LOG"
  exit 1
}
# Assert that the extended regex "$1" matches somewhere in $LOG.
loghas() {
  if ! grep -Eq "$1" "$LOG"; then
    fail "expected /$1/ in $LOG"
  fi
}
# Assert that the extended regex "$1" matches nowhere in $LOG.
lognot() {
  ! grep -Eq "$1" "$LOG" || fail "unexpected /$1/ in $LOG"
}
# readable list: its one URL is loaded and counted (count must be non-zero)
printf 'file://%s/index.html\n' "$tmp" >"$tmp/urls.txt"
run "$tmp/ok" "$tmp/urls.txt"
loghas '[1-9][0-9]* links added from'
# missing file: quoted name + a non-empty reason, never the old reasonless
# "Could not include URL list: <name>". The reason is the stat() errno, not the
# directory fallback literal (guards against dropping the errno lookup).
run "$tmp/miss" "$tmp/nope.txt"
loghas 'Could not include URL list "[^"]+": .+'
lognot 'Could not include URL list: '
lognot 'not a regular file'
# a directory is rejected with our own reason (locale-independent)
mkdir -p "$tmp/adir"
run "$tmp/dir" "$tmp/adir"
loghas 'Could not include URL list "[^"]+": not a regular file'
# unreadable regular file: the fopen() errno arm fires, distinct from the
# directory branch. Root bypasses mode 000, so skip it there.
if test "$(id -u)" -ne 0; then
: >"$tmp/noperm.txt"
chmod 000 "$tmp/noperm.txt"
run "$tmp/perm" "$tmp/noperm.txt"
chmod 644 "$tmp/noperm.txt"
loghas 'Could not include URL list "[^"]+": .+'
lognot 'not a regular file'
fi
exit 0

View File

@@ -71,3 +71,27 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
nomatch '*[\[\]]' '[]x'
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
# means the size is still unknown (scan time). A size exclusion must stay neutral
# then, so the file is fetched and only cancelled once its size is known (#143).
# fsize <expected-output> <size> <string> <filter...>
# Runs the filtersize self-test with the remaining arguments and exits the
# script with status 1 unless its output equals <expected-output> exactly.
# httrack's own exit status is not checked, only what it printed.
fsize() {
  local expected="$1"
  local actual
  shift
  actual="$(httrack -O /dev/null -#test=filtersize "$@")" || true
  if test "$actual" != "$expected"; then
    exit 1
  fi
}
fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # scan time: keep
fsize 'verdict=forbidden size_flag=1' 5 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # <10KB: cancel
fsize 'verdict=allowed size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # >=10KB: keep
fsize 'verdict=forbidden size_flag=0' -1 foo.txt -* '+*.jpg' '-*.jpg*[<10]' # not a jpg
# the '>' operator is just as neutral at scan time, and fires once size is known
fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # scan time: keep
fsize 'verdict=forbidden size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
nomatch '*[path]/end' 'a?b/end'
nomatch '*[file]end' 'foo?xend'
nomatch '*[name]X' 'abc?X'
match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
match '*.aspx' 'page.aspx?y=2'

View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# A -mime: exclusion must abort the transfer on the response Content-Type, not
# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
# the real one: the file is absent either way, but only the fix keeps the count
# tiny (header only) instead of pulling the body. Match it positively (a small,
# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'mimex/real.html' \
--not-found 'mimex/blob.pdf' \
--log-found 'excluded by MIME type filter' \
--log-found '\[[0-9]{1,4} bytes received' \
httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'

View File

@@ -34,6 +34,7 @@ TESTS = \
01_engine-dns.test \
01_engine-doitlog.test \
01_engine-entities.test \
01_engine-filelist.test \
01_engine-filter.test \
01_engine-hashtable.test \
01_engine-idna.test \
@@ -66,6 +67,7 @@ TESTS = \
21_local-intl-update.test \
22_local-broken-size.test \
23_local-errpage.test \
24_local-resume-overlap.test
24_local-resume-overlap.test \
25_local-mime-exclude.test
CLEANFILES = check-network_sh.cache

View File

@@ -177,6 +177,24 @@ class Handler(SimpleHTTPRequestHandler):
body, ctype = self.TYPE_MATRIX[path]
self.send_raw(body, ctype)
# --- MIME-type exclusion abort (issue #58) -----------------------------
# A -mime:application/pdf filter must abort the transfer once the header
# arrives, not download the whole body and discard it.
def route_mimex_index(self):
    """Send the index page: one link to blob.pdf and one to real.html."""
    anchors = (
        '\t<a href="blob.pdf">pdf</a>\n',
        '\t<a href="real.html">real</a>\n',
    )
    self.send_html("".join(anchors))
# 1 MB body: the fix aborts after the header, so httrack's "bytes received"
# stays tiny; without it the engine reads the body and the count jumps.
MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
def route_mimex_blob(self):
    """Send the MIMEX_BLOB payload declared as application/pdf."""
    payload = self.MIMEX_BLOB
    self.send_raw(payload, "application/pdf")
def route_mimex_real(self):
    """Send a small HTML document declared as text/html."""
    body = b"<html><body>real</body></html>"
    self.send_raw(body, "text/html")
# --- special chars in URLs across an update (issue #157) ---------------
# A dotless, accented basename served as text/html (MediaWiki style). The
# name the first crawl picks (.html) must survive the update pass.
@@ -355,6 +373,9 @@ class Handler(SimpleHTTPRequestHandler):
"/errpage/good.html": route_errpage_good,
"/errpage/missing.html": route_errpage_missing,
"/errpage/empty.html": route_errpage_empty,
"/mimex/index.html": route_mimex_index,
"/mimex/blob.pdf": route_mimex_blob,
"/mimex/real.html": route_mimex_real,
}
# --- dispatch ----------------------------------------------------------