mirror of
https://github.com/xroche/httrack.git
synced 2026-07-04 08:04:13 +03:00
Compare commits
3 Commits
master
...
delayed-sl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
61a533e6f3 | ||
|
|
47ab43762b | ||
|
|
37322e0bb3 |
@@ -2237,12 +2237,13 @@ int host_wait(httrackp * opt, lien_back * back) {
|
||||
|
||||
static int slot_can_be_cleaned(const lien_back * back) {
|
||||
return (back->status == STATUS_READY) // ready
|
||||
/* Check autoclean */
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
/* Check autoclean */
|
||||
&& (!back->locked) // not held by hts_wait_delayed (name pending)
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
}
|
||||
|
||||
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
|
||||
@@ -2891,10 +2892,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
// range size hack old location
|
||||
|
||||
#if HTS_DIRECTDISK
|
||||
// Court-circuit:
|
||||
// Peut-on stocker le fichier directement sur disque?
|
||||
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
|
||||
if (back[i].status) {
|
||||
// Shortcut: store the file directly on disk when possible,
|
||||
// sparing memory
|
||||
if (back[i].status &&
|
||||
!back[i].locked) { // name still pending when locked
|
||||
if (back[i].r.is_write == 0) { // mode mémoire
|
||||
if (back[i].r.adr == NULL) { // rien n'a été écrit
|
||||
if (!back[i].testmode) { // pas mode test
|
||||
@@ -3961,7 +3962,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back[i].r.adr[0] = 0;
|
||||
}
|
||||
hts_log_print(opt, LOG_TRACE, "finalizing empty");
|
||||
back_finalize(opt, cache, sback, i);
|
||||
/* locked = name pending; the waiter finalizes after
|
||||
patching url_sav (else: cached as .delayed, #5) */
|
||||
if (!back[i].locked)
|
||||
back_finalize(opt, cache, sback, i);
|
||||
} else if (!back[i].r.is_chunk) { // pas de chunk
|
||||
//if (back[i].r.http11!=2) { // pas de chunk
|
||||
back[i].is_chunk = 0;
|
||||
|
||||
@@ -4845,6 +4845,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* Patch destination filename for direct-to-disk mode, BEFORE any
|
||||
finalize: it records and caches the entry under url_sav */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
/* Finalize now as we have the type */
|
||||
if (back[b].status == STATUS_READY) {
|
||||
if (!back[b].finalized) {
|
||||
@@ -4852,8 +4855,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
back_finalize(opt, cache, sback, b);
|
||||
}
|
||||
}
|
||||
/* Patch destination filename for direct-to-disk mode */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
}
|
||||
|
||||
} // b >= 0
|
||||
|
||||
20
tests/33_local-delayed.test
Normal file
20
tests/33_local-delayed.test
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
|
||||
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
|
||||
# no "bogus state" cache warnings, resolvable links still land correctly.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
|
||||
--found 'delayed/real.pdf' \
|
||||
--file-matches 'delayed/real.pdf' '%PDF' \
|
||||
--found 'delayed/notype.bin.html' \
|
||||
--found 'delayed/empty.html' \
|
||||
--not-found 'delayed/noloc.html' \
|
||||
--not-found 'delayed/selfloop.html' \
|
||||
--not-found 'delayed/chain9.pdf' \
|
||||
--log-not-found 'bogus state' \
|
||||
httrack 'BASEURL/delayed/index.html'
|
||||
@@ -93,6 +93,7 @@ TESTS = \
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -246,6 +246,14 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107)
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
i=0
|
||||
while test "$i" -lt "${#audit[@]}"; do
|
||||
|
||||
@@ -392,6 +392,50 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_redir_target(self):
|
||||
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
|
||||
|
||||
# --- delayed-type degenerate paths (issues #5/#107) --------------------
|
||||
def route_delayed_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="noloc.php">noloc</a>\n'
|
||||
'\t<a href="selfloop.php">selfloop</a>\n'
|
||||
'\t<a href="chain1.php">chain</a>\n'
|
||||
'\t<a href="redir.php">redir</a>\n'
|
||||
'\t<a href="notype.bin">notype</a>\n'
|
||||
'\t<a href="empty.php">empty</a>\n'
|
||||
)
|
||||
|
||||
def send_redirect(self, location):
|
||||
self.send_response(302, "Found")
|
||||
if location is not None:
|
||||
self.send_header("Location", location)
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
def route_delayed_noloc(self):
|
||||
self.send_redirect(None) # 302 without Location: name never resolves
|
||||
|
||||
def route_delayed_selfloop(self):
|
||||
self.send_redirect("selfloop.php")
|
||||
|
||||
def route_delayed_chain(self):
|
||||
# chain1..chain9: one more hop than the type-check redirect budget
|
||||
n = int(urlsplit(self.path).path.rsplit("chain", 1)[1].split(".")[0])
|
||||
if n < 9:
|
||||
self.send_redirect("chain%d.php" % (n + 1))
|
||||
else:
|
||||
self.send_raw(self.FAKE_PDF, "application/pdf")
|
||||
|
||||
def route_delayed_redir(self):
|
||||
self.send_redirect("real.pdf")
|
||||
|
||||
def route_delayed_realpdf(self):
|
||||
self.send_raw(self.FAKE_PDF, "application/pdf")
|
||||
|
||||
def route_delayed_notype(self):
|
||||
self.send_raw(self.FAKE_PDF, None)
|
||||
|
||||
def route_delayed_empty(self):
|
||||
self.send_raw(b"", "text/html") # 200 + Content-Length: 0
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -432,6 +476,22 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/cdispo/index.html": route_cdispo_index,
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/delayed/index.html": route_delayed_index,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
"/delayed/real.pdf": route_delayed_realpdf,
|
||||
"/delayed/notype.bin": route_delayed_notype,
|
||||
"/delayed/empty.php": route_delayed_empty,
|
||||
"/delayed/chain1.php": route_delayed_chain,
|
||||
"/delayed/chain2.php": route_delayed_chain,
|
||||
"/delayed/chain3.php": route_delayed_chain,
|
||||
"/delayed/chain4.php": route_delayed_chain,
|
||||
"/delayed/chain5.php": route_delayed_chain,
|
||||
"/delayed/chain6.php": route_delayed_chain,
|
||||
"/delayed/chain7.php": route_delayed_chain,
|
||||
"/delayed/chain8.php": route_delayed_chain,
|
||||
"/delayed/chain9.php": route_delayed_chain,
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
|
||||
Reference in New Issue
Block a user