mirror of
https://github.com/xroche/httrack.git
synced 2026-06-28 13:07:35 +03:00
Compare commits
1 Commits
fix/pdf-co
...
selftest-n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a21ec9acf |
@@ -57,10 +57,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// DOS
|
||||
#include <process.h> /* _beginthread, _endthread */
|
||||
#endif
|
||||
#include <io.h> /* _chsize_s */
|
||||
#define HTS_FTRUNCATE(fp, sz) _chsize_s(_fileno(fp), (sz))
|
||||
#else
|
||||
#define HTS_FTRUNCATE(fp, sz) ftruncate(fileno(fp), (sz))
|
||||
#endif
|
||||
|
||||
#define VT_CLREOL "\33[K"
|
||||
@@ -3777,70 +3774,35 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
// xxc SI CHUNK VERIFIER QUE CA MARCHE??
|
||||
if (back[i].r.statuscode == 206) { // on nous envoie un morceau (la fin) coz une partie sur disque!
|
||||
off_t sz = fsize_utf8(back[i].url_sav);
|
||||
/* RFC 7233: resume at the server's Content-Range start,
|
||||
not the offset we requested; a server may resume
|
||||
earlier and appending the overlap duplicates bytes
|
||||
(#198). */
|
||||
const LLint resume = back[i].r.crange_start;
|
||||
const hts_boolean range_ok =
|
||||
back[i].r.crange > 0 && resume >= 0 &&
|
||||
resume <= (LLint) sz &&
|
||||
back[i].r.crange_end + 1 == back[i].r.crange &&
|
||||
(back[i].r.totalsize < 0 ||
|
||||
back[i].r.totalsize ==
|
||||
back[i].r.crange_end - resume + 1);
|
||||
|
||||
#if HDEBUG
|
||||
printf("partial content: " LLintP " on disk..\n",
|
||||
(LLint) sz);
|
||||
#endif
|
||||
if (sz >= 0 && range_ok) {
|
||||
if (sz >= 0) {
|
||||
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
|
||||
if (opt->getmode & HTS_GETMODE_NONHTML) {
|
||||
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
|
||||
file_notify(opt, back[i].url_adr, back[i].url_fil,
|
||||
back[i].url_sav, 0, 1,
|
||||
back[i].r.notmodified);
|
||||
back[i].r.out =
|
||||
FOPEN(fconv(catbuff, sizeof(catbuff),
|
||||
back[i].url_sav),
|
||||
"r+b"); // resume in place
|
||||
back[i].r.out = FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "ab"); // append
|
||||
if (back[i].r.out && opt->cache != 0) {
|
||||
back[i].r.is_write = 1;
|
||||
back[i].r.size = resume; // bytes already on disk
|
||||
back[i].r.statuscode = HTTP_OK; // force 'OK'
|
||||
back[i].r.is_write = 1; // écrire
|
||||
back[i].r.size = sz; // déja écrit
|
||||
back[i].r.statuscode = HTTP_OK; // Forcer 'OK'
|
||||
if (back[i].r.totalsize >= 0)
|
||||
back[i].r.totalsize += resume; // -> full size
|
||||
// drop bytes past the resume point; a silent
|
||||
// failure could leave a stale tail, so on error
|
||||
// drop the partial and refetch the whole file
|
||||
if (HTS_FTRUNCATE(back[i].r.out,
|
||||
(off_t) resume) != 0) {
|
||||
fclose(back[i].r.out);
|
||||
back[i].r.out = NULL;
|
||||
url_savename_refname_remove(
|
||||
opt, back[i].url_adr, back[i].url_fil);
|
||||
UNLINK(back[i].url_sav);
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
strcpybuff(back[i].r.msg,
|
||||
"Can not truncate partial file, "
|
||||
"restarting");
|
||||
} else {
|
||||
fseeko(back[i].r.out, (off_t) resume, SEEK_SET);
|
||||
/* create a temporary reference file in case of
|
||||
* broken mirror */
|
||||
if (back_serialize_ref(opt, &back[i]) != 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Could not create temporary "
|
||||
"reference file for %s%s",
|
||||
back[i].url_adr,
|
||||
back[i].url_fil);
|
||||
}
|
||||
#if HDEBUG
|
||||
printf("continue interrupted file\n");
|
||||
#endif
|
||||
back[i].r.totalsize += sz; // plus en fait
|
||||
fseek(back[i].r.out, 0, SEEK_END); // à la fin
|
||||
/* create a temporary reference file in case of broken mirror */
|
||||
if (back_serialize_ref(opt, &back[i]) != 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Could not create temporary reference file for %s%s",
|
||||
back[i].url_adr, back[i].url_fil);
|
||||
}
|
||||
#if HDEBUG
|
||||
printf("continue interrupted file\n");
|
||||
#endif
|
||||
} else { // On est dans la m**
|
||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||
back_set_finished(sback, i);
|
||||
@@ -3852,18 +3814,17 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
FILE *fp =
|
||||
FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "rb");
|
||||
if (fp) {
|
||||
LLint alloc_mem = resume + 1;
|
||||
LLint alloc_mem = sz + 1;
|
||||
|
||||
if (back[i].r.totalsize >= 0)
|
||||
alloc_mem += back[i].r.totalsize; // AJOUTER RESTANT!
|
||||
if (deleteaddr(&back[i].r)
|
||||
&& (back[i].r.adr =
|
||||
(char *) malloct((size_t) alloc_mem))) {
|
||||
back[i].r.size = resume;
|
||||
back[i].r.size = sz;
|
||||
if (back[i].r.totalsize >= 0)
|
||||
back[i].r.totalsize += resume; // -> full size
|
||||
if ((fread(back[i].r.adr, 1, (size_t) resume,
|
||||
fp)) != (size_t) resume) {
|
||||
back[i].r.totalsize += sz; // plus en fait
|
||||
if ((fread(back[i].r.adr, 1, sz, fp)) != sz) {
|
||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||
back_set_finished(sback, i);
|
||||
strcpybuff(back[i].r.msg,
|
||||
@@ -3881,30 +3842,14 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
"No memory for partial file");
|
||||
}
|
||||
fclose(fp);
|
||||
} else { // open failed
|
||||
} else { // Argh..
|
||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||
back_set_finished(sback, i);
|
||||
strcpybuff(back[i].r.msg,
|
||||
"Can not open partial file");
|
||||
}
|
||||
}
|
||||
} else if (sz >=
|
||||
0) { // unusable range -> restart whole file
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Unusable partial-content range for %s%s "
|
||||
"(have " LLintP " bytes, got " LLintP
|
||||
"-" LLintP "/" LLintP "), restarting",
|
||||
back[i].url_adr, back[i].url_fil,
|
||||
(LLint) sz, back[i].r.crange_start,
|
||||
back[i].r.crange_end, back[i].r.crange);
|
||||
url_savename_refname_remove(opt, back[i].url_adr,
|
||||
back[i].url_fil);
|
||||
UNLINK(back[i].url_sav);
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
strcpybuff(back[i].r.msg,
|
||||
"Unusable partial content, restarting");
|
||||
} else { // partial not found
|
||||
} else { // Non trouvé??
|
||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||
back_set_finished(sback, i);
|
||||
strcpybuff(back[i].r.msg, "Can not find partial file");
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
#!/bin/bash
# Issue #198: on a resumed download the server may answer the Range with a 206
# that starts *before* the offset we asked for (block-aligned ranges). httrack
# must honor the returned Content-Range, not blindly append, or the overlap
# bytes get duplicated and the file grows (corrupt PDFs). Pass 1 interrupts
# flaky.bin mid-body (partial + temp-ref); pass 2 resumes against a 206 that
# backs up 8 bytes. The result must equal the same bytes fetched whole (full.bin).
#
# Exit status: 0 on success, 1 on failure, 77 when python3 is unavailable.
set -eu

: "${top_srcdir:=..}"
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"

# The local test server is a Python script; without an interpreter, skip (77).
if ! command -v python3 >/dev/null; then
    echo "python3 not found; skipping"
    exit 77
fi

tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_198.XXXXXX") || exit 1
serverpid=
crawlpid=

# Kill any still-running crawl and the server, then remove the scratch dir.
# Runs from the trap below, so every step tolerates already-dead processes.
cleanup() {
    if test -n "$crawlpid"; then kill -9 "$crawlpid" 2>/dev/null || true; fi
    if test -n "$serverpid"; then
        kill "$serverpid" 2>/dev/null || true
        wait "$serverpid" 2>/dev/null || true
    fi
    rm -rf "$tmpdir"
}
trap cleanup EXIT HUP INT QUIT PIPE TERM

# OVERLAP_COUNTER gets a byte per flaky.bin request so pass 1 knows when to interrupt.
serverlog="${tmpdir}/server.log"
counter="${tmpdir}/hits"
resumed="${tmpdir}/resumed" # gets a byte when the server serves a resume 206
OVERLAP_COUNTER="$counter" OVERLAP_RESUMED="$resumed" \
    python3 "$server" --root "${testdir}/server-root" \
    >"$serverlog" 2>&1 &
serverpid=$!

# Wait (up to ~5s) for the server to print "PORT <n>" as its first log line.
port=
for _ in $(seq 1 50); do
    # The log file is created by the background child, so it may not exist yet;
    # "|| true" keeps a failing head from aborting the script under "set -e".
    line=$(head -n1 "$serverlog" 2>/dev/null) || true
    if test "${line%% *}" = "PORT"; then
        port="${line#PORT }"
        break
    fi
    kill -0 "$serverpid" 2>/dev/null || {
        echo "server exited early: $(cat "$serverlog")"
        exit 1
    }
    sleep 0.1
done
test -n "$port" || {
    echo "could not discover server port"
    exit 1
}
base="http://127.0.0.1:${port}"

command -v httrack >/dev/null || {
    echo "could not find httrack"
    exit 1
}
out="${tmpdir}/crawl"
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0 -c1)
refdir="${out}/hts-cache/ref"

# pass 1: interrupt once flaky.bin's prefix is streaming (partial + temp-ref).
printf '[pass 1: interrupt flaky.bin] ..\t'
httrack "${common[@]}" "${base}/overlap/index.html" >"${tmpdir}/log1" 2>&1 &
crawlpid=$!
# Poll (up to ~30s) until the server has recorded a flaky.bin request, or the
# crawl died on its own.
for _ in $(seq 1 300); do
    test -s "$counter" && break
    kill -0 "$crawlpid" 2>/dev/null || break
    sleep 0.1
done
# Give the crawl a moment to write the streamed prefix before interrupting it.
sleep 0.5
kill -TERM "$crawlpid" 2>/dev/null || true
wait "$crawlpid" 2>/dev/null || true
crawlpid=
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
    echo "FAIL: no temp-ref survived pass 1; cannot drive the resume"
    exit 1
}
echo "OK (temp-ref present)"

# pass 2: --continue -> resume Range -> 206 that starts 8 bytes early.
printf '[pass 2: resume flaky.bin] ..\t'
httrack "${common[@]}" --continue "${base}/overlap/index.html" >"${tmpdir}/log2" 2>&1 || true
echo "OK"

# Guard against a silent full re-download: the byte-compare below only tests the
# fix if pass 2 actually went through the resume Range -> 206 path.
printf '[resume path was exercised] ..\t'
if ! test -s "$resumed"; then
    echo "FAIL: pass 2 never triggered a resume 206; the overlap fix was not exercised"
    exit 1
fi
echo "OK"

printf '[resumed file is not corrupted] ..\t'
dir=$(find "$out" -maxdepth 1 -type d -name '127.0.0.1*' | head -1)
flaky="${dir}/overlap/flaky.bin"
full="${dir}/overlap/full.bin"
if ! test -f "$flaky" || ! test -f "$full"; then
    echo "FAIL: flaky.bin or full.bin missing after pass 2"
    exit 1
fi
if ! cmp -s "$flaky" "$full"; then
    echo "FAIL: resumed flaky.bin ($(wc -c <"$flaky")) != full.bin ($(wc -c <"$full")); overlap duplicated"
    exit 1
fi
echo "OK ($(wc -c <"$flaky") bytes, byte-identical)"
|
||||
@@ -65,7 +65,6 @@ TESTS = \
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test \
|
||||
24_local-resume-overlap.test
|
||||
23_local-errpage.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -225,71 +225,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
# 206 resume must honor the server's Content-Range, not the offset we asked
|
||||
# for (#198): a server resuming a few bytes *before* the request must not
|
||||
# leave httrack duplicating the overlap onto the partial. flaky.bin
|
||||
# interrupts once then resumes OVERLAP_EARLY bytes early; full.bin serves
|
||||
# the identical bytes in one shot, so the test can compare the two.
|
||||
OVERLAP_BLOB = b"%PDF-1.4\n" + bytes((i * 37 + 11) % 256 for i in range(8000))
|
||||
OVERLAP_EARLY = 8
|
||||
OVERLAP_PREFIX_LEN = 4000 # flushed before the stall
|
||||
_overlap_started = False
|
||||
|
||||
def route_overlap_index(self):
    """Serve the index page that links to flaky.bin and full.bin."""
    links = '\t<a href="flaky.bin">flaky</a>\n\t<a href="full.bin">full</a>'
    self.send_html(links)
|
||||
|
||||
def route_overlap_full(self):
    """Serve the complete OVERLAP_BLOB in a single response."""
    payload = self.OVERLAP_BLOB
    content_type = "application/octet-stream"
    self.send_raw(payload, content_type)
|
||||
|
||||
def route_overlap(self):
    """Serve flaky.bin: stall the first download, then resume a few bytes early.

    Behavior by request:
      * First request without a Range header: send a 200 with the full
        Content-Length, write only the first OVERLAP_PREFIX_LEN bytes, then
        sleep until an OSError ends the wait, so the client can be
        interrupted mid-body.
      * Later requests without a Range header: serve the whole blob.
      * Requests with a Range header: answer 206 starting OVERLAP_EARLY
        bytes before the requested offset (never below 0), through the end
        of the blob.

    Side effects: appends one byte to the file named by $OVERLAP_COUNTER on
    every request, and one byte to the file named by $OVERLAP_RESUMED when
    the 206 path is taken (each only if the variable is set).
    """
    counter = os.environ.get("OVERLAP_COUNTER")
    if counter:
        with open(counter, "a") as fp:
            fp.write("x")
    blob = self.OVERLAP_BLOB
    rng = self.headers.get("Range")
    # First GET: stream a prefix then stall, so the crawl can be interrupted
    # mid-body (partial + temp-ref on disk).
    if rng is None and not Handler._overlap_started:
        Handler._overlap_started = True
        self.send_response(200)
        self.send_header("Content-Type", "application/octet-stream")
        self.send_header("Content-Length", str(len(blob)))
        self.send_header("Accept-Ranges", "bytes")
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(blob[: self.OVERLAP_PREFIX_LEN])
            self.wfile.flush()
            try:
                while True:
                    time.sleep(3600)
            except OSError:
                pass
        return
    if rng is None:  # no resume request: serve the whole file
        return self.route_overlap_full()
    # Resume: honor the Range, but back up OVERLAP_EARLY bytes.
    # A suffix range ("bytes=-N") or otherwise unparsable first-byte-pos has
    # no usable start offset; fall back to 0, the same default already used
    # for non-"bytes=" units, instead of letting int() raise ValueError.
    start = 0
    if rng.startswith("bytes="):
        first_pos = rng[len("bytes=") :].split("-")[0]
        try:
            start = int(first_pos)
        except ValueError:
            start = 0
    start = max(0, start - self.OVERLAP_EARLY)
    # Signal that the resume Range -> 206 path actually fired, so the test
    # can prove it was exercised (not a silent full re-download).
    resumed = os.environ.get("OVERLAP_RESUMED")
    if resumed:
        with open(resumed, "a") as fp:
            fp.write("x")
    part = blob[start:]
    self.send_response(206, "Partial Content")
    self.send_header("Content-Type", "application/octet-stream")
    self.send_header("Content-Length", str(len(part)))
    self.send_header(
        "Content-Range", "bytes %d-%d/%d" % (start, len(blob) - 1, len(blob))
    )
    self.end_headers()
    if self.command != "HEAD":
        self.wfile.write(part)
|
||||
|
||||
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||
def route_errpage_index(self):
|
||||
@@ -346,9 +281,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
"/overlap/index.html": route_overlap_index,
|
||||
"/overlap/flaky.bin": route_overlap,
|
||||
"/overlap/full.bin": route_overlap_full,
|
||||
"/size/index.html": route_size_index,
|
||||
"/size/oversize.bin": route_size_oversize,
|
||||
"/errpage/index.html": route_errpage_index,
|
||||
|
||||
Reference in New Issue
Block a user