Compare commits

..

3 Commits

Author SHA1 Message Date
Xavier Roche
61a533e6f3 Empty responses finalized delayed slots under the .delayed name
Found while probing for a harness trigger of the finalize race: a
delayed-type URL answered 200 with Content-Length: 0 takes the
empty-response branch, which reaches READY inside the header round and
calls back_finalize directly, while url_sav still holds the .delayed
placeholder. Deterministic on any such URL: the 'bogus state
(incomplete type)' warning and an entry recorded under the placeholder
name. Empty responses are common in the wild (beacons, placeholder
files), so this is the reproducible face of #5.

Defer the finalize when the slot is locked, matching the other guards:
the waiter finalizes right after patching url_sav. The new empty.php
fixture makes 33_local-delayed fail on master and pass here; the
timing race itself remains untriggerable from the harness (headers and
body advance in separate back_wait rounds; probe notes in the PR).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-04 03:47:41 +02:00
Xavier Roche
47ab43762b Exercise the degenerate delayed-type paths; audit .delayed leftovers everywhere
New delayed/ fixtures (302 without Location, self-loop, a chain one hop
past the redirect budget, a proper redirect, a typeless unknown-ext
file) with 33_local-delayed.test, run twice via --rerun. local-crawl.sh
now fails ANY local test whose finished mirror contains a *.delayed
temporary, guarding the whole suite against the #107 class.

Probing the degenerate paths settled a review question: afs->save can
still be a fresh placeholder when url_sav is patched (redirects that
never resolve), but nothing name-keyed is stored there -- non-OK
entries are cached header-only with an empty X-Save, the link is
dropped, no file is left. The test pins exactly that.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-04 03:21:47 +02:00
Xavier Roche
37322e0bb3 Fix delayed-slot races: lock guards and finalize order
A slot whose savename is still the .delayed placeholder could be
finalized and recorded under that transient name when the transfer
completed before hts_wait_delayed resolved the type (fast servers):
back_clean ignored the lock hts_wait_delayed holds, the direct-to-disk
shortcut bound the output file to the placeholder name, and the
type-known finalize ran before url_sav was patched, caching the entry
under the wrong name (the 'bogus state (incomplete type)' warnings).
Skip locked slots in back_clean and in the disk shortcut, and patch
url_sav before finalizing.

The delayed-naming body sniff (next commit) waits for body bytes and
would hit these races deterministically; 15_local-types and
18_local-update cover the flow end to end.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-04 03:04:02 +02:00
6 changed files with 108 additions and 14 deletions

View File

@@ -2237,12 +2237,13 @@ int host_wait(httrackp * opt, lien_back * back) {
static int slot_can_be_cleaned(const lien_back * back) {
return (back->status == STATUS_READY) // ready
/* Check autoclean */
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
/* Check autoclean */
&& (!back->locked) // not held by hts_wait_delayed (name pending)
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
}
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
@@ -2891,10 +2892,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
// range size hack old location
#if HTS_DIRECTDISK
// Court-circuit:
// Peut-on stocker le fichier directement sur disque?
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
if (back[i].status) {
// Shortcut: store the file directly on disk when possible,
// sparing memory
if (back[i].status &&
!back[i].locked) { // name still pending when locked
if (back[i].r.is_write == 0) { // mode mémoire
if (back[i].r.adr == NULL) { // rien n'a été écrit
if (!back[i].testmode) { // pas mode test
@@ -3961,7 +3962,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
back[i].r.adr[0] = 0;
}
hts_log_print(opt, LOG_TRACE, "finalizing empty");
back_finalize(opt, cache, sback, i);
/* locked = name pending; the waiter finalizes after
patching url_sav (else: cached as .delayed, #5) */
if (!back[i].locked)
back_finalize(opt, cache, sback, i);
} else if (!back[i].r.is_chunk) { // pas de chunk
//if (back[i].r.http11!=2) { // pas de chunk
back[i].is_chunk = 0;

View File

@@ -4845,6 +4845,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
/* Still have a back reference */
if (b >= 0) {
/* Patch destination filename for direct-to-disk mode, BEFORE any
finalize: it records and caches the entry under url_sav */
strcpybuff(back[b].url_sav, afs->save);
/* Finalize now as we have the type */
if (back[b].status == STATUS_READY) {
if (!back[b].finalized) {
@@ -4852,8 +4855,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
back_finalize(opt, cache, sback, b);
}
}
/* Patch destination filename for direct-to-disk mode */
strcpybuff(back[b].url_sav, afs->save);
}
} // b >= 0

View File

@@ -0,0 +1,20 @@
#!/bin/bash
#
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
# no "bogus state" cache warnings, resolvable links still land correctly.
set -euo pipefail
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
--found 'delayed/real.pdf' \
--file-matches 'delayed/real.pdf' '%PDF' \
--found 'delayed/notype.bin.html' \
--found 'delayed/empty.html' \
--not-found 'delayed/noloc.html' \
--not-found 'delayed/selfloop.html' \
--not-found 'delayed/chain9.pdf' \
--log-not-found 'bogus state' \
httrack 'BASEURL/delayed/index.html'

View File

@@ -93,6 +93,7 @@ TESTS = \
29_local-redirect-fragment.test \
30_local-fragment-link.test \
31_local-javaclass.test \
32_local-cdispo.test
32_local-cdispo.test \
33_local-delayed.test
CLEANFILES = check-network_sh.cache

View File

@@ -246,6 +246,14 @@ done
test -n "$hostroot" || die "could not find host root under $out"
debug "host root: $hostroot"
# A completed crawl must leave no .delayed temporaries (issue #107)
info "checking for leftover .delayed files"
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
if test -z "$leftovers"; then result "OK"; else
result "leftover: $leftovers"
exit 1
fi
# --- audit -------------------------------------------------------------------
i=0
while test "$i" -lt "${#audit[@]}"; do

View File

@@ -392,6 +392,50 @@ class Handler(SimpleHTTPRequestHandler):
def route_redir_target(self):
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
# --- delayed-type degenerate paths (issues #5/#107) --------------------
def route_delayed_index(self):
self.send_html(
'\t<a href="noloc.php">noloc</a>\n'
'\t<a href="selfloop.php">selfloop</a>\n'
'\t<a href="chain1.php">chain</a>\n'
'\t<a href="redir.php">redir</a>\n'
'\t<a href="notype.bin">notype</a>\n'
'\t<a href="empty.php">empty</a>\n'
)
def send_redirect(self, location):
self.send_response(302, "Found")
if location is not None:
self.send_header("Location", location)
self.send_header("Content-Length", "0")
self.end_headers()
def route_delayed_noloc(self):
self.send_redirect(None) # 302 without Location: name never resolves
def route_delayed_selfloop(self):
self.send_redirect("selfloop.php")
def route_delayed_chain(self):
# chain1..chain9: one more hop than the type-check redirect budget
n = int(urlsplit(self.path).path.rsplit("chain", 1)[1].split(".")[0])
if n < 9:
self.send_redirect("chain%d.php" % (n + 1))
else:
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_redir(self):
self.send_redirect("real.pdf")
def route_delayed_realpdf(self):
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_notype(self):
self.send_raw(self.FAKE_PDF, None)
def route_delayed_empty(self):
self.send_raw(b"", "text/html") # 200 + Content-Length: 0
ROUTES = {
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
@@ -432,6 +476,22 @@ class Handler(SimpleHTTPRequestHandler):
"/cdispo/index.html": route_cdispo_index,
"/cdispo/fetch.php": route_cdispo,
"/cdispo/evil.php": route_cdispo,
"/delayed/index.html": route_delayed_index,
"/delayed/noloc.php": route_delayed_noloc,
"/delayed/selfloop.php": route_delayed_selfloop,
"/delayed/redir.php": route_delayed_redir,
"/delayed/real.pdf": route_delayed_realpdf,
"/delayed/notype.bin": route_delayed_notype,
"/delayed/empty.php": route_delayed_empty,
"/delayed/chain1.php": route_delayed_chain,
"/delayed/chain2.php": route_delayed_chain,
"/delayed/chain3.php": route_delayed_chain,
"/delayed/chain4.php": route_delayed_chain,
"/delayed/chain5.php": route_delayed_chain,
"/delayed/chain6.php": route_delayed_chain,
"/delayed/chain7.php": route_delayed_chain,
"/delayed/chain8.php": route_delayed_chain,
"/delayed/chain9.php": route_delayed_chain,
"/redir/index.html": route_redir_index,
"/redir/go.php": route_redir_go,
"/redir/target.html": route_redir_target,