mirror of
https://github.com/xroche/httrack.git
synced 2026-06-28 21:17:57 +03:00
Compare commits
4 Commits
fix/update
...
fix/cache-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f900ca5efd | ||
|
|
5501faa7b1 | ||
|
|
6322b6fb1f | ||
|
|
58f368a91a |
@@ -220,6 +220,25 @@ struct cache_back_zip_entry {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||
did. */
|
||||
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||
const char *what, int zErr) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (check_fatal_io_errno()) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
}
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
}
|
||||
|
||||
/* Ajout d'un fichier en cache */
|
||||
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_adr, const char *url_fil, const char *url_save,
|
||||
@@ -236,6 +255,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_save_suffix = url_save;
|
||||
int zErr;
|
||||
|
||||
/* already failed and aborting; don't touch the broken stream again */
|
||||
if (cache->zipWriteFailed)
|
||||
return;
|
||||
|
||||
// robots.txt hack
|
||||
if (url_save == NULL) {
|
||||
dataincache = 0; // testing links
|
||||
@@ -346,9 +369,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
*/
|
||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||
int zip_zipOpenNewFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipOpenNewFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Write data in cache */
|
||||
@@ -358,9 +380,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||
(int) r->size)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -381,9 +402,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||
(int) nl)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||
zErr);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} while(nl > 0);
|
||||
@@ -397,16 +419,14 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
|
||||
/* Close */
|
||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||
int zip_zipCloseFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipCloseFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Flush */
|
||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||
int zip_zipFlush_failed = 0;
|
||||
|
||||
assertf(zip_zipFlush_failed);
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htslib.h"
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@@ -316,6 +317,136 @@ static int disk_fallback_selftest(httrackp *opt) {
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* Fault-injection state shared with selftest_failing_zwrite() through the
   minizip "opaque" pointer. */
typedef struct {
  /** Remaining bytes that may still be written before writes start failing. */
  size_t budget;
  /** Value stored into errno by the failing write (ENOSPC, EIO, ...). */
  int fail_errno;
  /** Number of zwrite calls seen, used to detect re-entry into the stream. */
  int writes;
} writefail_inject;
|
||||
|
||||
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||
(the #174/#219 condition). Counts calls so the test can prove a flagged cache
|
||||
never re-enters the stream. */
|
||||
static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||
const void *buf, uLong size) {
|
||||
writefail_inject *inj = (writefail_inject *) opaque;
|
||||
|
||||
inj->writes++;
|
||||
if (inj->budget >= (size_t) size) {
|
||||
inj->budget -= (size_t) size;
|
||||
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||
}
|
||||
errno = inj->fail_errno;
|
||||
return 0; /* short write -> the minizip op returns an error */
|
||||
}
|
||||
|
||||
/* Open a ZIP whose writes fail past inj->budget, so cache_add() hits an error.
|
||||
*/
|
||||
static zipFile selftest_open_failing_zip(const char *path,
|
||||
writefail_inject *inj) {
|
||||
zlib_filefunc_def ff;
|
||||
|
||||
fill_fopen_filefunc(&ff); /* real fopen/read/seek/close; ignores opaque */
|
||||
ff.zwrite_file = selftest_failing_zwrite;
|
||||
ff.opaque = inj;
|
||||
return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &ff);
|
||||
}
|
||||
|
||||
/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP). */
|
||||
static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||
const char *body, size_t body_len) {
|
||||
htsblk r;
|
||||
char locbuf[4];
|
||||
char *bodycopy = malloct(body_len);
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = 200;
|
||||
r.size = (LLint) body_len;
|
||||
strcpybuff(r.msg, "OK");
|
||||
strcpybuff(r.contenttype, "application/octet-stream");
|
||||
locbuf[0] = '\0';
|
||||
r.location = locbuf;
|
||||
r.is_write = 0;
|
||||
memcpy(bodycopy, body, body_len);
|
||||
r.adr = bodycopy;
|
||||
cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
|
||||
NULL);
|
||||
freet(bodycopy);
|
||||
}
|
||||
|
||||
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
int fail = 0;
|
||||
char path[HTS_URLMAXSIZE];
|
||||
/* incompressible + big, so deflate flushes (and fails) mid-write, before
|
||||
* close */
|
||||
static const size_t body_len = 256 * 1024;
|
||||
char *body = malloct(body_len);
|
||||
int phase;
|
||||
|
||||
gen_body(body, body_len, 1 /* incompressible */);
|
||||
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||
|
||||
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||
branch). Both must abort the mirror. */
|
||||
for (phase = 0; phase < 2; phase++) {
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
int writes_after_fail;
|
||||
|
||||
inj.budget = (phase == 0) ? 4096 : 0;
|
||||
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||
inj.writes = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
if (cache.zipOutput == NULL) {
|
||||
fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
|
||||
fail++;
|
||||
continue;
|
||||
}
|
||||
|
||||
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (!cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||
phase);
|
||||
fail++;
|
||||
}
|
||||
if (opt->state.exit_xh != -1) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
|
||||
phase, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
|
||||
/* a flagged cache must no-op a sibling write: no further backend write */
|
||||
writes_after_fail = inj.writes;
|
||||
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||
if (inj.writes != writes_after_fail) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: sibling write re-entered the broken "
|
||||
"stream (%d extra backend writes)\n",
|
||||
phase, inj.writes - writes_after_fail);
|
||||
fail++;
|
||||
}
|
||||
|
||||
if (cache.zipOutput != NULL) {
|
||||
zipClose(cache.zipOutput,
|
||||
NULL); /* best-effort; may fail on the backend */
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
freet(body);
|
||||
return fail;
|
||||
}
|
||||
|
||||
int cache_selftests(httrackp *opt, const char *dir) {
|
||||
int failures = 0;
|
||||
cache_back cache;
|
||||
|
||||
@@ -52,6 +52,10 @@ int cache_selftests(httrackp *opt, const char *dir);
|
||||
committed file, never by the test). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||
crashing. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -214,6 +214,8 @@ struct cache_back {
|
||||
cache_back_zip_entry *zipEntries;
|
||||
int zipEntriesOffs;
|
||||
int zipEntriesCapa;
|
||||
hts_boolean
|
||||
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||
|
||||
@@ -353,6 +353,14 @@ static void basic_selftests(void) {
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 1) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
|
||||
// empty fil: no extension to scan; must not over-read before the string.
|
||||
// flag==0 -> 0 (nothing written), flag==1 -> octet-stream.
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
|
||||
0) == 0);
|
||||
assertf(r.contenttype[0] == '\0');
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
|
||||
1) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
|
||||
// a user --assume rule with an empty value matches but writes nothing:
|
||||
// get_userhttptype returns 1 with the buffer empty, so get_httptype_sized
|
||||
// must still report 0 (callers test the return like the old
|
||||
@@ -2445,6 +2453,20 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'W': // cache write-failure handling: httrack -#W <dir>
|
||||
if (na + 1 < argc) {
|
||||
const int err =
|
||||
cache_write_failure_selftest(opt, argv[na + 1]);
|
||||
|
||||
printf("cache-writefail: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} else {
|
||||
fprintf(stderr, "Option #W requires a directory argument\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'B': // golden cache fixture read: httrack -#B <dir> [regen]
|
||||
if (na + 1 < argc) {
|
||||
const int regen =
|
||||
|
||||
@@ -4177,9 +4177,10 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
/* Check html -> text/html */
|
||||
const char *a = fil + strlen(fil) - 1;
|
||||
|
||||
while((*a != '.') && (*a != '/') && (a > fil))
|
||||
/* a < fil when fil is empty: bound before dereferencing */
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (*a == '.' && strlen(a) < 32) {
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
|
||||
a++;
|
||||
|
||||
24
tests/01_engine-cache-writefail.test
Normal file
24
tests/01_engine-cache-writefail.test
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
#
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
# tool flags despite the #!/bin/bash above.

# Cache write-failure handling (httrack -#W <dir>). #174/#219.
# A failing new.zip write (disk full) used to crash the process via assertf; it
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
# self-test asserts that; reverting the fix makes -#W abort (SIGABRT) and fail.

set -eu

dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT

# Capture the exit status explicitly: under `set -e` a bare out=$(...) would
# end the script on a non-zero exit before the diagnostic below is printed.
rc=0
out=$(httrack -#W "$dir") || rc=$?

# Match the exact success line (error logs also go to stdout); a bad -#W falls
# through to the usage screen, which exits non-zero but never prints this.
if test "$rc" -ne 0 || ! printf '%s\n' "$out" | grep -qx "cache-writefail: OK"; then
    echo "expected 'cache-writefail: OK', got (exit $rc): $out" >&2
    exit 1
fi
|
||||
17
tests/22_local-broken-size.test
Executable file
17
tests/22_local-broken-size.test
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.

# Stop at the first failing crawl audit: without this only the last command's
# exit status would decide the test result and the default-mode check below
# could fail unnoticed.
set -e

: "${top_srcdir:=..}"

# Default: warn, but the file is still written.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
    --found 'size/oversize.bin' \
    --log-found 'bogus state \(broken size' \
    httrack 'BASEURL/size/index.html'

# -%B (tolerant): no warning, file written.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
    --found 'size/oversize.bin' \
    --log-not-found 'bogus state' \
    httrack 'BASEURL/size/index.html' '-%B'
|
||||
19
tests/23_local-errpage.test
Normal file
19
tests/23_local-errpage.test
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
# half also does not reproduce; the purge path is not exercised here.)
set -e

: "${top_srcdir:=..}"

# Crawl the errpage fixture with one httrack flag ($1); the remaining
# arguments are the audit checks handed to local-crawl.sh.
crawl_errpage() {
    flag=$1
    shift
    bash "$top_srcdir/tests/local-crawl.sh" --errors 1 "$@" \
        httrack 'BASEURL/errpage/index.html' "$flag"
}

# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
crawl_errpage '-o0' \
    --found 'errpage/good.html' \
    --found 'errpage/empty.html' \
    --not-found 'errpage/missing.html'

# Control -o1 (default): the 404 error page is written.
crawl_errpage '-o1' \
    --found 'errpage/missing.html'
|
||||
@@ -26,6 +26,7 @@ TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -61,6 +62,8 @@ TESTS = \
|
||||
18_local-update.test \
|
||||
19_local-connect-fallback.test \
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -107,7 +109,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -257,6 +259,22 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-found)
|
||||
i=$((i + 1))
|
||||
info "checking log matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
|
||||
result "not in log"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-not-found)
|
||||
i=$((i + 1))
|
||||
info "checking log lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
|
||||
result "present in log"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -225,6 +225,39 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||
def route_errpage_index(self):
    """Send the errpage index: one link each to good, missing and empty."""
    targets = (
        ("good.html", "good"),
        ("missing.html", "missing"),
        ("empty.html", "empty"),
    )
    markup = "".join('\t<a href="%s">%s</a>\n' % pair for pair in targets)
    self.send_html(markup)
|
||||
|
||||
def route_errpage_good(self):
    """Send a small HTML document through send_raw() as text/html."""
    page = b"<html><body>good page</body></html>\n"
    mime = "text/html"
    self.send_raw(page, mime)
|
||||
|
||||
def route_errpage_missing(self):
    """Send an HTML error body with status 404 / "Not Found"."""
    error_body = "\t404 error body"
    self.send_html(error_body, status=404, extra_status="Not Found")
|
||||
|
||||
def route_errpage_empty(self):
    """Send a zero-length text/html body (the genuine 0-byte file case)."""
    nothing = b""
    self.send_raw(nothing, "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
def route_size_index(self):
    """Send the size index page: a single link to oversize.bin."""
    link = '\t<a href="%s">%s</a>\n' % ("oversize.bin", "over")
    self.send_html(link)
|
||||
|
||||
def route_size_oversize(self):
    """Send 100 bytes while declaring a Content-Length two bytes shorter.

    The headers are always sent; the body is written for every command
    except HEAD.
    """
    payload = b"A" * 100
    declared = len(payload) - 2  # lie: too short
    self.send_response(200)
    for name, value in (
        ("Content-Type", "application/octet-stream"),
        ("Content-Length", str(declared)),
        ("Connection", "close"),
    ):
        self.send_header(name, value)
    self.end_headers()
    if self.command == "HEAD":
        return
    self.wfile.write(payload)
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -248,6 +281,12 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
"/size/index.html": route_size_index,
|
||||
"/size/oversize.bin": route_size_oversize,
|
||||
"/errpage/index.html": route_errpage_index,
|
||||
"/errpage/good.html": route_errpage_good,
|
||||
"/errpage/missing.html": route_errpage_missing,
|
||||
"/errpage/empty.html": route_errpage_empty,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user