Compare commits

...

3 Commits

Author SHA1 Message Date
Xavier Roche
7754a5b2b9 tests: run 24_local-resume-overlap under set -e
Follow the golden rule for shell scripts: start with set -e so a non-last
failure can't be masked. Guard the backgrounded-crawl kill/wait spots with
|| true so the expected SIGTERM exit doesn't abort the run.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 09:37:15 +02:00
Xavier Roche
828bdbd632 Harden #198 fix: verify the truncate, assert the test hit the resume path
Two follow-ups from review of the resume fix.

If HTS_FTRUNCATE fails the partial could keep a stale tail (only when the
resource shrank between runs, sz > full, so the body write no longer covers
the old end). Check its return and, on failure, drop the partial and refetch
the whole file instead of writing a possibly-corrupt one.

The resume test only compared the resumed bytes against the whole file, which
also passes if httrack silently re-downloads the file with no Range (the bug
never fires). Mark when the server actually serves a resume 206 and assert pass
2 hit that path, so a full re-download fails the test instead of passing it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 09:26:10 +02:00
Xavier Roche
5640bb6837 Honor the server's Content-Range when resuming a partial download (#198)
A resumed download (Range: bytes=N-) may be answered with a 206 whose range
starts before N: block-aligned caches and CDNs routinely round the start down
to a block boundary, and RFC 7233 lets the server pick the range it returns.
httrack ignored the returned Content-Range and blindly appended the 206 body to
the bytes already on disk, so the overlapping bytes were duplicated and the file
grew by the overlap. With timing deciding which files get interrupted (and thus
resumed), this surfaced as a random subset of files corrupted on each run, each
a few bytes too large.

Resume at the server's crange_start instead: ftruncate the partial to that
offset and write the 206 body there (the in-memory branch keeps only that
prefix). When the returned range is unusable (a forward gap, no/garbage
Content-Range, or one that doesn't reach EOF) drop the partial and refetch the
whole file rather than stitch a corrupt one.

Reading the existing crange_start/crange_end/crange fields only, no ABI change.
Driven by tests/24_local-resume-overlap.test: pass 1 interrupts a download
mid-body, pass 2 resumes against a 206 that backs up 8 bytes, and the result
must be byte-identical to the same content fetched whole.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 09:11:31 +02:00
4 changed files with 254 additions and 21 deletions

View File

@@ -57,7 +57,10 @@ Please visit our Website: http://www.httrack.com
// DOS
#include <process.h> /* _beginthread, _endthread */
#endif
#include <io.h> /* _chsize_s */
#define HTS_FTRUNCATE(fp, sz) _chsize_s(_fileno(fp), (sz))
#else
#define HTS_FTRUNCATE(fp, sz) ftruncate(fileno(fp), (sz))
#endif
#define VT_CLREOL "\33[K"
@@ -3774,35 +3777,70 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
// xxc SI CHUNK VERIFIER QUE CA MARCHE??
if (back[i].r.statuscode == 206) { // on nous envoie un morceau (la fin) coz une partie sur disque!
off_t sz = fsize_utf8(back[i].url_sav);
/* RFC 7233: resume at the server's Content-Range start,
not the offset we requested; a server may resume
earlier and appending the overlap duplicates bytes
(#198). */
const LLint resume = back[i].r.crange_start;
const hts_boolean range_ok =
back[i].r.crange > 0 && resume >= 0 &&
resume <= (LLint) sz &&
back[i].r.crange_end + 1 == back[i].r.crange &&
(back[i].r.totalsize < 0 ||
back[i].r.totalsize ==
back[i].r.crange_end - resume + 1);
#if HDEBUG
printf("partial content: " LLintP " on disk..\n",
(LLint) sz);
#endif
if (sz >= 0) {
if (sz >= 0 && range_ok) {
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
if (opt->getmode & HTS_GETMODE_NONHTML) {
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
file_notify(opt, back[i].url_adr, back[i].url_fil,
back[i].url_sav, 0, 1,
back[i].r.notmodified);
back[i].r.out = FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "ab"); // append
back[i].r.out =
FOPEN(fconv(catbuff, sizeof(catbuff),
back[i].url_sav),
"r+b"); // resume in place
if (back[i].r.out && opt->cache != 0) {
back[i].r.is_write = 1; // écrire
back[i].r.size = sz; // déja écrit
back[i].r.statuscode = HTTP_OK; // Forcer 'OK'
back[i].r.is_write = 1;
back[i].r.size = resume; // bytes already on disk
back[i].r.statuscode = HTTP_OK; // force 'OK'
if (back[i].r.totalsize >= 0)
back[i].r.totalsize += sz; // plus en fait
fseek(back[i].r.out, 0, SEEK_END); // à la fin
/* create a temporary reference file in case of broken mirror */
if (back_serialize_ref(opt, &back[i]) != 0) {
hts_log_print(opt, LOG_WARNING,
"Could not create temporary reference file for %s%s",
back[i].url_adr, back[i].url_fil);
}
back[i].r.totalsize += resume; // -> full size
// drop bytes past the resume point; a silent
// failure could leave a stale tail, so on error
// drop the partial and refetch the whole file
if (HTS_FTRUNCATE(back[i].r.out,
(off_t) resume) != 0) {
fclose(back[i].r.out);
back[i].r.out = NULL;
url_savename_refname_remove(
opt, back[i].url_adr, back[i].url_fil);
UNLINK(back[i].url_sav);
back[i].status = STATUS_READY;
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Can not truncate partial file, "
"restarting");
} else {
fseeko(back[i].r.out, (off_t) resume, SEEK_SET);
/* create a temporary reference file in case of
* broken mirror */
if (back_serialize_ref(opt, &back[i]) != 0) {
hts_log_print(opt, LOG_WARNING,
"Could not create temporary "
"reference file for %s%s",
back[i].url_adr,
back[i].url_fil);
}
#if HDEBUG
printf("continue interrupted file\n");
printf("continue interrupted file\n");
#endif
}
} else { // On est dans la m**
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
@@ -3814,17 +3852,18 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
FILE *fp =
FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "rb");
if (fp) {
LLint alloc_mem = sz + 1;
LLint alloc_mem = resume + 1;
if (back[i].r.totalsize >= 0)
alloc_mem += back[i].r.totalsize; // AJOUTER RESTANT!
if (deleteaddr(&back[i].r)
&& (back[i].r.adr =
(char *) malloct((size_t) alloc_mem))) {
back[i].r.size = sz;
back[i].r.size = resume;
if (back[i].r.totalsize >= 0)
back[i].r.totalsize += sz; // plus en fait
if ((fread(back[i].r.adr, 1, sz, fp)) != sz) {
back[i].r.totalsize += resume; // -> full size
if ((fread(back[i].r.adr, 1, (size_t) resume,
fp)) != (size_t) resume) {
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
@@ -3842,14 +3881,30 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
"No memory for partial file");
}
fclose(fp);
} else { // Argh..
} else { // open failed
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Can not open partial file");
}
}
} else { // Non trouvé??
} else if (sz >=
0) { // unusable range -> restart whole file
hts_log_print(opt, LOG_WARNING,
"Unusable partial-content range for %s%s "
"(have " LLintP " bytes, got " LLintP
"-" LLintP "/" LLintP "), restarting",
back[i].url_adr, back[i].url_fil,
(LLint) sz, back[i].r.crange_start,
back[i].r.crange_end, back[i].r.crange);
url_savename_refname_remove(opt, back[i].url_adr,
back[i].url_fil);
UNLINK(back[i].url_sav);
back[i].status = STATUS_READY;
back_set_finished(sback, i);
strcpybuff(back[i].r.msg,
"Unusable partial content, restarting");
} else { // partial not found
back[i].status = STATUS_READY; // terminé (voir plus loin)
back_set_finished(sback, i);
strcpybuff(back[i].r.msg, "Can not find partial file");

View File

@@ -0,0 +1,109 @@
#!/bin/bash
# Issue #198: on a resumed download the server may answer the Range with a 206
# that starts *before* the offset we asked for (block-aligned ranges). httrack
# must honor the returned Content-Range, not blindly append, or the overlap
# bytes get duplicated and the file grows (corrupt PDFs). Pass 1 interrupts
# flaky.bin mid-body (partial + temp-ref); pass 2 resumes against a 206 that
# backs up 8 bytes. The result must equal the same bytes fetched whole (full.bin).
set -eu
: "${top_srcdir:=..}"
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_198.XXXXXX") || exit 1
serverpid=
crawlpid=
cleanup() {
if test -n "$crawlpid"; then kill -9 "$crawlpid" 2>/dev/null || true; fi
if test -n "$serverpid"; then
kill "$serverpid" 2>/dev/null || true
wait "$serverpid" 2>/dev/null || true
fi
rm -rf "$tmpdir"
}
trap cleanup EXIT HUP INT QUIT PIPE TERM
# OVERLAP_COUNTER gets a byte per flaky.bin request so pass 1 knows when to interrupt.
serverlog="${tmpdir}/server.log"
counter="${tmpdir}/hits"
resumed="${tmpdir}/resumed" # gets a byte when the server serves a resume 206
OVERLAP_COUNTER="$counter" OVERLAP_RESUMED="$resumed" \
python3 "$server" --root "${testdir}/server-root" \
>"$serverlog" 2>&1 &
serverpid=$!
port=
for _ in $(seq 1 50); do
line=$(head -n1 "$serverlog" 2>/dev/null)
if test "${line%% *}" == "PORT"; then
port="${line#PORT }"
break
fi
kill -0 "$serverpid" 2>/dev/null || {
echo "server exited early: $(cat "$serverlog")"
exit 1
}
sleep 0.1
done
test -n "$port" || {
echo "could not discover server port"
exit 1
}
base="http://127.0.0.1:${port}"
which httrack >/dev/null || {
echo "could not find httrack"
exit 1
}
out="${tmpdir}/crawl"
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0 -c1)
refdir="${out}/hts-cache/ref"
# pass 1: interrupt once flaky.bin's prefix is streaming (partial + temp-ref).
printf '[pass 1: interrupt flaky.bin] ..\t'
httrack "${common[@]}" "${base}/overlap/index.html" >"${tmpdir}/log1" 2>&1 &
crawlpid=$!
for _ in $(seq 1 300); do
test -s "$counter" && break
kill -0 "$crawlpid" 2>/dev/null || break
sleep 0.1
done
sleep 0.5
kill -TERM "$crawlpid" 2>/dev/null || true
wait "$crawlpid" 2>/dev/null || true
crawlpid=
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
echo "FAIL: no temp-ref survived pass 1; cannot drive the resume"
exit 1
}
echo "OK (temp-ref present)"
# pass 2: --continue -> resume Range -> 206 that starts 8 bytes early.
printf '[pass 2: resume flaky.bin] ..\t'
httrack "${common[@]}" --continue "${base}/overlap/index.html" >"${tmpdir}/log2" 2>&1 || true
echo "OK"
# Guard against a silent full re-download: the byte-compare below only tests the
# fix if pass 2 actually went through the resume Range -> 206 path.
printf '[resume path was exercised] ..\t'
if ! test -s "$resumed"; then
echo "FAIL: pass 2 never triggered a resume 206; the overlap fix was not exercised"
exit 1
fi
echo "OK"
printf '[resumed file is not corrupted] ..\t'
dir=$(find "$out" -maxdepth 1 -type d -name '127.0.0.1*' | head -1)
flaky="${dir}/overlap/flaky.bin"
full="${dir}/overlap/full.bin"
if ! test -f "$flaky" || ! test -f "$full"; then
echo "FAIL: flaky.bin or full.bin missing after pass 2"
exit 1
fi
if ! cmp -s "$flaky" "$full"; then
echo "FAIL: resumed flaky.bin ($(wc -c <"$flaky")) != full.bin ($(wc -c <"$full")); overlap duplicated"
exit 1
fi
echo "OK ($(wc -c <"$flaky") bytes, byte-identical)"

View File

@@ -65,6 +65,7 @@ TESTS = \
20_local-resume-loop.test \
21_local-intl-update.test \
22_local-broken-size.test \
23_local-errpage.test
23_local-errpage.test \
24_local-resume-overlap.test
CLEANFILES = check-network_sh.cache

View File

@@ -225,6 +225,71 @@ class Handler(SimpleHTTPRequestHandler):
self.send_header("Content-Length", "0")
self.end_headers()
# 206 resume must honor the server's Content-Range, not the offset we asked
# for (#198): a server resuming a few bytes *before* the request must not
# leave httrack duplicating the overlap onto the partial. flaky.bin
# interrupts once then resumes OVERLAP_EARLY bytes early; full.bin serves
# the identical bytes in one shot, so the test can compare the two.
OVERLAP_BLOB = b"%PDF-1.4\n" + bytes((i * 37 + 11) % 256 for i in range(8000))
OVERLAP_EARLY = 8
OVERLAP_PREFIX_LEN = 4000 # flushed before the stall
_overlap_started = False
def route_overlap_index(self):
self.send_html('\t<a href="flaky.bin">flaky</a>\n\t<a href="full.bin">full</a>')
def route_overlap_full(self):
self.send_raw(self.OVERLAP_BLOB, "application/octet-stream")
def route_overlap(self):
counter = os.environ.get("OVERLAP_COUNTER")
if counter:
with open(counter, "a") as fp:
fp.write("x")
blob = self.OVERLAP_BLOB
rng = self.headers.get("Range")
# First GET: stream a prefix then stall, so the crawl can be interrupted
# mid-body (partial + temp-ref on disk).
if rng is None and not Handler._overlap_started:
Handler._overlap_started = True
self.send_response(200)
self.send_header("Content-Type", "application/octet-stream")
self.send_header("Content-Length", str(len(blob)))
self.send_header("Accept-Ranges", "bytes")
self.end_headers()
if self.command != "HEAD":
self.wfile.write(blob[: self.OVERLAP_PREFIX_LEN])
self.wfile.flush()
try:
while True:
time.sleep(3600)
except OSError:
pass
return
if rng is None: # no resume request: serve the whole file
return self.route_overlap_full()
# Resume: honor the Range, but back up OVERLAP_EARLY bytes.
start = (
int(rng[len("bytes=") :].split("-")[0]) if rng.startswith("bytes=") else 0
)
start = max(0, start - self.OVERLAP_EARLY)
# Signal that the resume Range -> 206 path actually fired, so the test
# can prove it was exercised (not a silent full re-download).
resumed = os.environ.get("OVERLAP_RESUMED")
if resumed:
with open(resumed, "a") as fp:
fp.write("x")
part = blob[start:]
self.send_response(206, "Partial Content")
self.send_header("Content-Type", "application/octet-stream")
self.send_header("Content-Length", str(len(part)))
self.send_header(
"Content-Range", "bytes %d-%d/%d" % (start, len(blob) - 1, len(blob))
)
self.end_headers()
if self.command != "HEAD":
self.wfile.write(part)
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
def route_errpage_index(self):
@@ -281,6 +346,9 @@ class Handler(SimpleHTTPRequestHandler):
"/intl/" + INTL_NAME: route_intl_page,
"/resume/index.html": route_resume_index,
"/resume/blob.txt": route_resume,
"/overlap/index.html": route_overlap_index,
"/overlap/flaky.bin": route_overlap,
"/overlap/full.bin": route_overlap_full,
"/size/index.html": route_size_index,
"/size/oversize.bin": route_size_oversize,
"/errpage/index.html": route_errpage_index,