Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
0a21ec9acf Replace single-letter -# self-tests with a named -#test=NAME registry
The hidden engine self-tests had accreted into a grab-bag of arbitrary
single-letter/-digit -# arms (-#0, -#A, -#W, ...) buried in the htscoremain.c
option switch, with no mnemonics and stale --help text. Collapse them into one
registry: -#test lists every test with a usage hint and one-line description,
and -#test=NAME [args] runs one.

The handlers and the two helpers they used (basic_selftests,
string_safety_selftests) move to a new htsselftest.c keyed by a
{name, args, desc, fn} table; htscoremain.c keeps only a small dispatch that
runs ahead of the no-URL usage gate, so a bare -#test (or an arg-less test like
copyopt/dns/cookies) no longer needs a dummy URL token to be reached. The
genuine debug knobs (-#L, -#C, -#R, -#h, ...) stay as letters in the switch;
only the unit self-tests, whose sole callers are tests/01_engine-*.test, are
renamed, so this is internal-only with no compatibility surface. Behavior is
preserved: each test prints the same result line and exit code, which the
existing assertions pin. Three now-unused includes (htscache_selftest.h,
htsdns_selftest.h, htsencoding.h) drop out of htscoremain.c.

Tests: the engine tests move to -#test=NAME; 01_engine-hashtable now asserts its
success line (not just exit code) so a misrouted registry row can't pass, and a
new 01_engine-selftest-dispatch covers the bare-list and unknown-name paths.

The --help/man "guru options" list now points at -#test instead of enumerating
a stale subset. The lone vestigial alias --debug-testfilters still resolves to
the removed -#0 (it was already non-functional: param1 supplies one argument,
-#0 required two); it is left untouched because editing that array forces
clang-format to reflow the whole untouched table.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 07:58:23 +02:00
4 changed files with 22 additions and 255 deletions

View File

@@ -57,10 +57,7 @@ Please visit our Website: http://www.httrack.com
// DOS
#include <process.h> /* _beginthread, _endthread */
#endif
#include <io.h> /* _chsize_s */
#define HTS_FTRUNCATE(fp, sz) _chsize_s(_fileno(fp), (sz))
#else
#define HTS_FTRUNCATE(fp, sz) ftruncate(fileno(fp), (sz))
#endif
#define VT_CLREOL "\33[K"
@@ -3777,70 +3774,35 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
// xxc SI CHUNK VERIFIER QUE CA MARCHE??
if (back[i].r.statuscode == 206) { // on nous envoie un morceau (la fin) coz une partie sur disque!
off_t sz = fsize_utf8(back[i].url_sav);
/* RFC 7233: resume at the server's Content-Range start,
not the offset we requested; a server may resume
earlier and appending the overlap duplicates bytes
(#198). */
const LLint resume = back[i].r.crange_start;
const hts_boolean range_ok =
back[i].r.crange > 0 && resume >= 0 &&
resume <= (LLint) sz &&
back[i].r.crange_end + 1 == back[i].r.crange &&
(back[i].r.totalsize < 0 ||
back[i].r.totalsize ==
back[i].r.crange_end - resume + 1);
#if HDEBUG
printf("partial content: " LLintP " on disk..\n",
(LLint) sz);
#endif
if (sz >= 0 && range_ok) {
if (sz >= 0) {
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
if (opt->getmode & HTS_GETMODE_NONHTML) {
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
file_notify(opt, back[i].url_adr, back[i].url_fil,
back[i].url_sav, 0, 1,
back[i].r.notmodified);
back[i].r.out =
FOPEN(fconv(catbuff, sizeof(catbuff),
back[i].url_sav),
"r+b"); // resume in place
back[i].r.out = FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "ab"); // append
if (back[i].r.out && opt->cache != 0) {
back[i].r.is_write = 1;
back[i].r.size = resume; // bytes already on disk
back[i].r.statuscode = HTTP_OK; // force 'OK'
back[i].r.is_write = 1; // écrire
back[i].r.size = sz; // déja écrit
back[i].r.statuscode = HTTP_OK; // Forcer 'OK'
if (back[i].r.totalsize >= 0)
back[i].r.totalsize += resume; // -> full size
// drop bytes past the resume point; a silent
// failure could leave a stale tail, so on error
// drop the partial and refetch the whole file
if (HTS_FTRUNCATE(back[i].r.out,
(off_t) resume) != 0) {
fclose(back[i].r.out);
back[i].r.out = NULL;
url_savename_refname_remove(
opt, back[i].url_adr, back[i].url_fil);
UNLINK(back[i].url_sav);
back[i].status = STATUS_READY;
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Can not truncate partial file, "
"restarting");
} else {
fseeko(back[i].r.out, (off_t) resume, SEEK_SET);
/* create a temporary reference file in case of
* broken mirror */
if (back_serialize_ref(opt, &back[i]) != 0) {
hts_log_print(opt, LOG_WARNING,
"Could not create temporary "
"reference file for %s%s",
back[i].url_adr,
back[i].url_fil);
}
#if HDEBUG
printf("continue interrupted file\n");
#endif
back[i].r.totalsize += sz; // plus en fait
fseek(back[i].r.out, 0, SEEK_END); // à la fin
/* create a temporary reference file in case of broken mirror */
if (back_serialize_ref(opt, &back[i]) != 0) {
hts_log_print(opt, LOG_WARNING,
"Could not create temporary reference file for %s%s",
back[i].url_adr, back[i].url_fil);
}
#if HDEBUG
printf("continue interrupted file\n");
#endif
} else { // On est dans la m**
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
@@ -3852,18 +3814,17 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
FILE *fp =
FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "rb");
if (fp) {
LLint alloc_mem = resume + 1;
LLint alloc_mem = sz + 1;
if (back[i].r.totalsize >= 0)
alloc_mem += back[i].r.totalsize; // AJOUTER RESTANT!
if (deleteaddr(&back[i].r)
&& (back[i].r.adr =
(char *) malloct((size_t) alloc_mem))) {
back[i].r.size = resume;
back[i].r.size = sz;
if (back[i].r.totalsize >= 0)
back[i].r.totalsize += resume; // -> full size
if ((fread(back[i].r.adr, 1, (size_t) resume,
fp)) != (size_t) resume) {
back[i].r.totalsize += sz; // plus en fait
if ((fread(back[i].r.adr, 1, sz, fp)) != sz) {
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
@@ -3881,30 +3842,14 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
"No memory for partial file");
}
fclose(fp);
} else { // open failed
} else { // Argh..
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Can not open partial file");
}
}
} else if (sz >=
0) { // unusable range -> restart whole file
hts_log_print(opt, LOG_WARNING,
"Unusable partial-content range for %s%s "
"(have " LLintP " bytes, got " LLintP
"-" LLintP "/" LLintP "), restarting",
back[i].url_adr, back[i].url_fil,
(LLint) sz, back[i].r.crange_start,
back[i].r.crange_end, back[i].r.crange);
url_savename_refname_remove(opt, back[i].url_adr,
back[i].url_fil);
UNLINK(back[i].url_sav);
back[i].status = STATUS_READY;
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Unusable partial content, restarting");
} else { // partial not found
} else { // Non trouvé??
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg, "Can not find partial file");

View File

@@ -1,109 +0,0 @@
#!/bin/bash
# Issue #198: on a resumed download the server may answer the Range with a 206
# that starts *before* the offset we asked for (block-aligned ranges). httrack
# must honor the returned Content-Range, not blindly append, or the overlap
# bytes get duplicated and the file grows (corrupt PDFs). Pass 1 interrupts
# flaky.bin mid-body (partial + temp-ref); pass 2 resumes against a 206 that
# backs up 8 bytes. The result must equal the same bytes fetched whole (full.bin).
# Abort on any failing command (-e) and on expansion of an unset variable (-u).
set -eu
# Keep a caller-supplied top_srcdir; otherwise default it to "..".
: "${top_srcdir:=..}"
# Absolute directory containing this script; the server script is looked up next to it.
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"
# Without python3: print a notice and exit 77. "! echo" inverts echo's success so
# the || chain continues on to "exit 77".
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
# Private scratch directory under $TMPDIR (or /tmp); cleanup() removes it again.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_198.XXXXXX") || exit 1
# PIDs of the background server and crawl; empty means "nothing to kill".
serverpid=
crawlpid=
# Exit/signal handler: stop whatever is still running in the background and
# delete the scratch directory. Every step tolerates failure so that the
# handler itself never aborts under "set -e".
cleanup() {
    # A crawl that is still alive at this point is killed outright.
    [ -z "$crawlpid" ] || kill -9 "$crawlpid" 2>/dev/null || true
    # Ask the server to terminate, then reap it.
    if [ -n "$serverpid" ]; then
        kill "$serverpid" 2>/dev/null || true
        wait "$serverpid" 2>/dev/null || true
    fi
    rm -rf "$tmpdir"
}
# Run cleanup on normal exit and on the usual fatal signals.
trap cleanup EXIT HUP INT QUIT PIPE TERM

# Start the local test server in the background, logging to $serverlog.
# OVERLAP_COUNTER gets a byte per flaky.bin request so pass 1 knows when to interrupt.
serverlog="${tmpdir}/server.log"
counter="${tmpdir}/hits"
resumed="${tmpdir}/resumed" # gets a byte when the server serves a resume 206
OVERLAP_COUNTER="$counter" OVERLAP_RESUMED="$resumed" \
    python3 "$server" --root "${testdir}/server-root" \
    >"$serverlog" 2>&1 &
serverpid=$!

# Poll (up to 50 x 0.1s) until the first log line reads "PORT <n>".
port=
for _ in $(seq 1 50); do
    # The log is created asynchronously by the backgrounded job, so it may not
    # exist yet. "|| true" keeps a failing head from aborting the whole script
    # under "set -e"; an empty $line simply means "try again".
    line=$(head -n1 "$serverlog" 2>/dev/null || true)
    if test "${line%% *}" == "PORT"; then
        port="${line#PORT }"
        break
    fi
    # Give up early if the server process is already gone.
    kill -0 "$serverpid" 2>/dev/null || {
        echo "server exited early: $(cat "$serverlog")"
        exit 1
    }
    sleep 0.1
done
test -n "$port" || {
    echo "could not discover server port"
    exit 1
}
base="http://127.0.0.1:${port}"
# Make sure an httrack binary is reachable before starting any crawl.
# "command -v" is the POSIX shell lookup and is what the python3 check in this
# script already uses; "which" is an external tool that is not guaranteed to
# be installed and whose output/exit status varies between systems.
command -v httrack >/dev/null || {
    echo "could not find httrack"
    exit 1
}
# Output tree for both crawls, the temp-ref directory inside it, and the
# httrack options shared by pass 1 and pass 2.
out="${tmpdir}/crawl"
refdir="${out}/hts-cache/ref"
common=(
    -O "$out"
    --quiet
    --disable-security-limits
    --robots=0
    --timeout=30
    --retries=0
    -c1
)

# Pass 1: launch the crawl in the background and interrupt it once the server
# has recorded a flaky.bin request, leaving a partial file plus temp-ref.
printf '[pass 1: interrupt flaky.bin] ..\t'
httrack "${common[@]}" "${base}/overlap/index.html" >"${tmpdir}/log1" 2>&1 &
crawlpid=$!

# Wait up to 300 x 0.1s for the hit counter to become non-empty, or stop
# waiting as soon as the crawl has exited on its own.
tries=0
while [ "$tries" -lt 300 ]; do
    if [ -s "$counter" ]; then
        break
    fi
    if ! kill -0 "$crawlpid" 2>/dev/null; then
        break
    fi
    sleep 0.1
    tries=$((tries + 1))
done

# Short grace period before terminating the crawl and reaping it.
sleep 0.5
kill -TERM "$crawlpid" 2>/dev/null || true
wait "$crawlpid" 2>/dev/null || true
crawlpid=

# Pass 2 can only resume if a *.ref file survived the interruption.
if [ -z "$(find "$refdir" -name '*.ref' 2>/dev/null)" ]; then
    echo "FAIL: no temp-ref survived pass 1; cannot drive the resume"
    exit 1
fi
echo "OK (temp-ref present)"
# Pass 2: rerun with --continue so httrack sends a resume Range; the server
# answers with a 206 that starts 8 bytes early. The crawl's own exit status
# is ignored here; the checks that follow decide pass/fail.
printf '[pass 2: resume flaky.bin] ..\t'
httrack "${common[@]}" --continue "${base}/overlap/index.html" >"${tmpdir}/log2" 2>&1 || true
echo "OK"

# The byte comparison further down only proves anything if the resume 206
# path really ran, so reject a silent full re-download first.
printf '[resume path was exercised] ..\t'
[ -s "$resumed" ] || {
    echo "FAIL: pass 2 never triggered a resume 206; the overlap fix was not exercised"
    exit 1
}
echo "OK"
# Final check: the resumed flaky.bin must be byte-identical to full.bin,
# which was fetched in a single request.
printf '[resumed file is not corrupted] ..\t'
# First mirror directory whose name starts with the server address.
dir=$(find "$out" -maxdepth 1 -type d -name '127.0.0.1*' | head -1)
flaky="${dir}/overlap/flaky.bin"
full="${dir}/overlap/full.bin"
if [ ! -f "$flaky" ] || [ ! -f "$full" ]; then
    echo "FAIL: flaky.bin or full.bin missing after pass 2"
    exit 1
fi
if cmp -s "$flaky" "$full"; then
    echo "OK ($(wc -c <"$flaky") bytes, byte-identical)"
else
    echo "FAIL: resumed flaky.bin ($(wc -c <"$flaky")) != full.bin ($(wc -c <"$full")); overlap duplicated"
    exit 1
fi

View File

@@ -65,7 +65,6 @@ TESTS = \
20_local-resume-loop.test \
21_local-intl-update.test \
22_local-broken-size.test \
23_local-errpage.test \
24_local-resume-overlap.test
23_local-errpage.test
CLEANFILES = check-network_sh.cache

View File

@@ -225,71 +225,6 @@ class Handler(SimpleHTTPRequestHandler):
self.send_header("Content-Length", "0")
self.end_headers()
# 206 resume must honor the server's Content-Range, not the offset we asked
# for (#198): a server resuming a few bytes *before* the request must not
# leave httrack duplicating the overlap onto the partial. flaky.bin
# interrupts once then resumes OVERLAP_EARLY bytes early; full.bin serves
# the identical bytes in one shot, so the test can compare the two.
# Shared payload: the 9-byte "%PDF-1.4\n" header followed by 8000 bytes
# generated by a fixed formula, so every request sees the same content.
OVERLAP_BLOB = b"%PDF-1.4\n" + bytes((i * 37 + 11) % 256 for i in range(8000))
# Number of bytes the resume 206 backs up before the requested offset.
OVERLAP_EARLY = 8
OVERLAP_PREFIX_LEN = 4000 # flushed before the stall
# Latch shared through the class: flipped to True by route_overlap when it
# serves the first (stalled) request, so later requests are not stalled.
_overlap_started = False
def route_overlap_index(self):
    """Send the index page whose two links point at flaky.bin and full.bin."""
    links = (
        '\t<a href="flaky.bin">flaky</a>\n'
        '\t<a href="full.bin">full</a>'
    )
    self.send_html(links)
def route_overlap_full(self):
    """Hand the complete OVERLAP_BLOB to send_raw as application/octet-stream."""
    payload = self.OVERLAP_BLOB
    content_type = "application/octet-stream"
    self.send_raw(payload, content_type)
def route_overlap(self):
    """Serve flaky.bin: stall the first plain GET, answer resumes with an early 206.

    In order:
    - Every request appends one byte to the file named by $OVERLAP_COUNTER,
      if that variable is set.
    - The first request without a Range header gets a 200 advertising the
      full Content-Length, but only OVERLAP_PREFIX_LEN bytes are written
      before the handler sleeps, so the client can be interrupted mid-body.
    - Any later request without a Range header gets the whole blob.
    - A request with a Range header gets a 206 starting OVERLAP_EARLY bytes
      before the requested offset (never below 0), and appends one byte to
      the file named by $OVERLAP_RESUMED, if that variable is set.

    A Range whose start offset cannot be parsed (a non-"bytes=" unit, or the
    suffix form "bytes=-500" whose start is empty) is treated as offset 0
    instead of raising ValueError inside the handler.
    """
    counter = os.environ.get("OVERLAP_COUNTER")
    if counter:
        with open(counter, "a") as fp:
            fp.write("x")
    blob = self.OVERLAP_BLOB
    rng = self.headers.get("Range")
    # First GET: stream a prefix then stall, so the crawl can be interrupted
    # mid-body (partial + temp-ref on disk).
    if rng is None and not Handler._overlap_started:
        Handler._overlap_started = True
        self.send_response(200)
        self.send_header("Content-Type", "application/octet-stream")
        self.send_header("Content-Length", str(len(blob)))
        self.send_header("Accept-Ranges", "bytes")
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(blob[: self.OVERLAP_PREFIX_LEN])
            self.wfile.flush()
            # Deliberate stall: keep the response open without sending more.
            try:
                while True:
                    time.sleep(3600)
            except OSError:
                pass
        return
    if rng is None:  # no resume request: serve the whole file
        return self.route_overlap_full()
    # Resume: honor the Range, but back up OVERLAP_EARLY bytes.
    start = 0
    if rng.startswith("bytes="):
        try:
            start = int(rng[len("bytes=") :].split("-")[0])
        except ValueError:
            # Empty or non-numeric start (e.g. "bytes=-500"): fall back to 0,
            # the same value used for an unrecognised range unit.
            start = 0
    start = max(0, start - self.OVERLAP_EARLY)
    # Signal that the resume Range -> 206 path actually fired, so the test
    # can prove it was exercised (not a silent full re-download).
    resumed = os.environ.get("OVERLAP_RESUMED")
    if resumed:
        with open(resumed, "a") as fp:
            fp.write("x")
    part = blob[start:]
    self.send_response(206, "Partial Content")
    self.send_header("Content-Type", "application/octet-stream")
    self.send_header("Content-Length", str(len(part)))
    self.send_header(
        "Content-Range", "bytes %d-%d/%d" % (start, len(blob) - 1, len(blob))
    )
    self.end_headers()
    if self.command != "HEAD":
        self.wfile.write(part)
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
def route_errpage_index(self):
@@ -346,9 +281,6 @@ class Handler(SimpleHTTPRequestHandler):
"/intl/" + INTL_NAME: route_intl_page,
"/resume/index.html": route_resume_index,
"/resume/blob.txt": route_resume,
"/overlap/index.html": route_overlap_index,
"/overlap/flaky.bin": route_overlap,
"/overlap/full.bin": route_overlap_full,
"/size/index.html": route_size_index,
"/size/oversize.bin": route_size_oversize,
"/errpage/index.html": route_errpage_index,