mirror of
https://github.com/xroche/httrack.git
synced 2026-06-25 11:37:28 +03:00
Compare commits
1 Commits
fix/update
...
fix-delaye
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
174bc565fc |
@@ -1729,10 +1729,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
StringBuff(opt->path_log), digest_filename);
|
||||
}
|
||||
|
||||
/* remove refname if any; HTS_TRUE if it was removed */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil) {
|
||||
/* remove refname if any */
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil) {
|
||||
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
||||
|
||||
return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||
(void) UNLINK(filename);
|
||||
}
|
||||
|
||||
@@ -104,9 +104,8 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil);
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3749,60 +3749,44 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
|
||||
} // bloc
|
||||
// erreur HTTP (ex: 404, not found)
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
|
||||
(r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
|
||||
// 412/416: the resume partial is stale; re-get the whole file (#206)
|
||||
lien_back *itemback = NULL;
|
||||
int had_partial = 0;
|
||||
int ref_existed = 0;
|
||||
int ref_gone;
|
||||
|
||||
// Drop the temp-ref, its partial, and heap->sav so the re-get carries no
|
||||
// Range; else back_add rebuilds the same Range and loops.
|
||||
if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
|
||||
&itemback) == 0) {
|
||||
had_partial = 1;
|
||||
ref_existed = 1;
|
||||
// best-effort: an orphaned partial cannot re-Range once the ref is gone
|
||||
if (fexist_utf8(itemback->url_sav))
|
||||
(void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
itemback->url_sav));
|
||||
back_clear_entry(itemback);
|
||||
freet(itemback);
|
||||
}
|
||||
// don't re-record if the ref survived (it would re-Range and loop)
|
||||
ref_gone =
|
||||
url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
|
||||
!ref_existed;
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
|
||||
|| (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
|
||||
) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
|
||||
if (fexist_utf8(heap(ptr)->sav)) {
|
||||
had_partial = 1;
|
||||
remove(heap(ptr)->sav);
|
||||
remove(heap(ptr)->sav); // Eliminer
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
|
||||
r->msg, urladr(), urlfil(),
|
||||
heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
|
||||
}
|
||||
|
||||
// Re-get once, only if a partial existed and both Range triggers are
|
||||
// gone; a failed removal gives up rather than looping. range_used is
|
||||
// unreliable (it does not survive the delayed-type two-pass).
|
||||
if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
|
||||
if (!fexist_utf8(heap(ptr)->sav)) { // Bien éliminé? (sinon on boucle..)
|
||||
#if HDEBUG
|
||||
printf("Partial content NOT up-to-date, reget all file for %s\n",
|
||||
heap(ptr)->sav);
|
||||
#endif
|
||||
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
||||
r->msg, urladr(), urlfil());
|
||||
// enregistrer le MEME lien
|
||||
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
heap_top()->testmode = heap(ptr)->testmode;
|
||||
heap_top()->link_import = 0;
|
||||
heap_top()->testmode = heap(ptr)->testmode; // mode test?
|
||||
heap_top()->link_import = 0; // pas mode import
|
||||
heap_top()->depth = heap(ptr)->depth;
|
||||
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
||||
heap_top()->retry = heap(ptr)->retry;
|
||||
heap_top()->premier = heap(ptr)->premier;
|
||||
heap_top()->precedent = ptr;
|
||||
//
|
||||
// canceller lien actuel
|
||||
error = 1;
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
} else { // out of memory
|
||||
XH_uninit;
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
//
|
||||
} else { // oups erreur, plus de mémoire!!
|
||||
XH_uninit; // désallocation mémoire & buffers
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Giving up on partial reget (%s) for %s%s", r->msg,
|
||||
urladr(), urlfil());
|
||||
hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
|
||||
error = 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Issue #206: a continue/update crawl looped forever when the resume Range got a
|
||||
# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
|
||||
set -u
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
testdir=$(cd "$(dirname "$0")" && pwd)
|
||||
server="${testdir}/local-server.py"
|
||||
|
||||
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
|
||||
|
||||
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
|
||||
serverpid=
|
||||
crawlpid=
|
||||
cleanup() {
|
||||
test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
|
||||
if test -n "$serverpid"; then
|
||||
kill "$serverpid" 2>/dev/null
|
||||
wait "$serverpid" 2>/dev/null
|
||||
fi
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
trap cleanup EXIT HUP INT QUIT PIPE TERM
|
||||
|
||||
# --- start the server, discover its ephemeral port --------------------------
|
||||
# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
|
||||
serverlog="${tmpdir}/server.log"
|
||||
counter="${tmpdir}/blobcount"
|
||||
RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
|
||||
serverpid=$!
|
||||
port=
|
||||
for _ in $(seq 1 50); do
|
||||
line=$(head -n1 "$serverlog" 2>/dev/null)
|
||||
if test "${line%% *}" == "PORT"; then
|
||||
port="${line#PORT }"
|
||||
break
|
||||
fi
|
||||
kill -0 "$serverpid" 2>/dev/null || {
|
||||
echo "server exited early: $(cat "$serverlog")"
|
||||
exit 1
|
||||
}
|
||||
sleep 0.1
|
||||
done
|
||||
test -n "$port" || {
|
||||
echo "could not discover server port"
|
||||
exit 1
|
||||
}
|
||||
base="http://127.0.0.1:${port}"
|
||||
|
||||
which httrack >/dev/null || {
|
||||
echo "could not find httrack"
|
||||
exit 1
|
||||
}
|
||||
out="${tmpdir}/crawl"
|
||||
mkdir "$out"
|
||||
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
|
||||
refdir="${out}/hts-cache/ref"
|
||||
|
||||
# --- pass 1: crawl, interrupt once the blob download is underway -------------
|
||||
printf '[pass 1: interrupt mid-download] ..\t'
|
||||
httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
|
||||
crawlpid=$!
|
||||
# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
|
||||
# finalizes the cache and serializes the temp-ref.
|
||||
for _ in $(seq 1 300); do
|
||||
test -s "$counter" && break
|
||||
kill -0 "$crawlpid" 2>/dev/null || break
|
||||
sleep 0.1
|
||||
done
|
||||
sleep 0.5
|
||||
kill -TERM "$crawlpid" 2>/dev/null
|
||||
wait "$crawlpid" 2>/dev/null
|
||||
crawlpid=
|
||||
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
|
||||
echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
|
||||
exit 1
|
||||
}
|
||||
echo "OK (temp-ref present)"
|
||||
before=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
|
||||
# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
|
||||
# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
|
||||
printf '[pass 2: resume must terminate] ..\t'
|
||||
HANG_RC=137 # 128 + SIGKILL
|
||||
httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
|
||||
crawlpid=$!
|
||||
(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
|
||||
guard=$!
|
||||
rc=0
|
||||
wait "$crawlpid" 2>/dev/null || rc=$?
|
||||
crawlpid=
|
||||
kill "$guard" 2>/dev/null || true
|
||||
wait "$guard" 2>/dev/null || true
|
||||
if test "$rc" -eq "$HANG_RC"; then
|
||||
echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (terminated, rc=$rc)"
|
||||
|
||||
# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
|
||||
# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
|
||||
after=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
hits=$((after - before))
|
||||
printf '[bounded re-get count] ..\t'
|
||||
if test "$hits" -lt 2; then
|
||||
echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
|
||||
exit 1
|
||||
fi
|
||||
if test "$hits" -gt 8; then
|
||||
echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (${hits} requests)"
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# #157: a dotless, accented URL named .html on the first crawl must keep .html
|
||||
# across an update -- not revert to the extensionless name.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'intl/Instalação_CVS_no_Ubuntu.html' \
|
||||
--not-found 'intl/Instalação_CVS_no_Ubuntu' \
|
||||
httrack 'BASEURL/intl/index.html'
|
||||
@@ -59,8 +59,6 @@ TESTS = \
|
||||
16_local-assume.test \
|
||||
17_local-empty-ct.test \
|
||||
18_local-update.test \
|
||||
19_local-connect-fallback.test \
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test
|
||||
19_local-connect-fallback.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -196,15 +196,6 @@ if test -n "$rerun"; then
|
||||
exit 1
|
||||
}
|
||||
result "OK (update)"
|
||||
# The update summary reports "files updated"; a fresh crawl never does. Assert
|
||||
# it so a regression that bypasses the cache (re-crawls fresh) can't pass.
|
||||
info "checking update used the cache"
|
||||
if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "update pass did not report cache activity"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
|
||||
@@ -15,7 +15,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
from urllib.parse import quote, unquote, urlsplit
|
||||
|
||||
@@ -177,54 +176,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# --- special chars in URLs across an update (issue #157) ---------------
|
||||
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||
# name the first crawl picks (.html) must survive the update pass.
|
||||
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||
|
||||
def route_intl_index(self):
|
||||
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||
|
||||
def route_intl_page(self):
|
||||
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||
|
||||
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||
RESUME_LEN = len(RESUME_PREFIX) + 4096 # declared length never delivered
|
||||
_resume_started = False
|
||||
|
||||
def route_resume_index(self):
|
||||
self.send_html('\t<a href="blob.txt">blob</a>')
|
||||
|
||||
def route_resume(self):
|
||||
counter = os.environ.get("RESUME_COUNTER")
|
||||
if counter:
|
||||
with open(counter, "a") as fp:
|
||||
fp.write("x")
|
||||
# First GET: stall mid-body so the crawl can be interrupted with a partial.
|
||||
if not Handler._resume_started:
|
||||
Handler._resume_started = True
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "image/png")
|
||||
self.send_header("Content-Length", str(self.RESUME_LEN))
|
||||
self.send_header("Accept-Ranges", "bytes")
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(self.RESUME_PREFIX)
|
||||
self.wfile.flush()
|
||||
try:
|
||||
while True:
|
||||
time.sleep(3600)
|
||||
except OSError:
|
||||
pass
|
||||
return
|
||||
self.send_response(416, "Requested Range Not Satisfiable")
|
||||
self.send_header("Content-Type", "image/png")
|
||||
self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -244,10 +195,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/style.css": route_types,
|
||||
"/types/data.json": route_types,
|
||||
"/types/gen.php": route_types,
|
||||
"/intl/index.html": route_intl_index,
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
@@ -255,8 +202,7 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
handler = self.ROUTES.get(path)
|
||||
if handler is not None:
|
||||
handler(self)
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user