Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
f900ca5efd Stop the mirror with a fatal error on a cache write failure, don't crash (#174/#219)
A failed write to the new.zip cache (zipOpenNewFileInZip / zipWriteInFileInZip /
zipCloseFileInZip / zipFlush returning non-Z_OK) was a fatal assertf() that
aborted the whole process and popped CRASH.TXT. The trigger is storage going
away mid-crawl: a disk filling up overnight (#174) or a network share holding
the mirror dropping (#219); WinHTTrack users commonly mirror to a NAS or mapped
drive.

The cache lives in the same output tree as the mirror, so a cache write failing
means the mirror files can no longer be written either. Continuing would only
produce a broken, incomplete mirror reported as success. So treat it the same
way the engine already treats a failed mirror-file write (htscore.c:1961,
htsback.c:2933): log the error and set opt->state.exit_xh = -1 to stop the
mirror cleanly and exit non-zero. No crash, no CRASH.TXT.

Route the cache_add() write sites through cache_zip_write_failed(), which logs
once (the standard "disk full or filesystem problems" message when
check_fatal_io_errno() confirms it) and flags the cache so sibling cache_add()
calls don't re-enter the broken stream before the loop notices exit_xh. The flag
is appended to the end of the engine-owned, non-installed struct cache_back, so
the ABI is unchanged.

Add an in-process self-test (httrack -#W) that drives cache_add() into a ZIP
whose disk-full backend fails its writes; 01_engine-cache-writefail.test asserts
httrack signals a fatal abort instead of crashing. Negative controls proven:
reverting the fix makes -#W abort (SIGABRT); dropping the exit_xh assignment
makes the test fail on the abort-signal check.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 06:44:23 +02:00
7 changed files with 211 additions and 15 deletions

View File

@@ -220,6 +220,25 @@ struct cache_back_zip_entry {
} \
} while(0)
/* Handle a failed write to the cache (new.zip).
   A failure here means the storage is gone (disk full / dropped share), so
   the mirror is doomed too: request a fatal stop through exit_xh instead of
   crashing the process as the former assertf() did.
   - what: short description of the step that failed (used in the log line)
   - zErr: error code returned by the failing zip call
   The error is logged only the first time; later calls just re-assert the
   abort request. */
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
                                   const char *what, int zErr) {
  const int first_failure = cache->zipWriteFailed ? 0 : 1;

  if (first_failure) {
    /* latch the failure so the report is emitted once */
    cache->zipWriteFailed = HTS_TRUE;
    if (!check_fatal_io_errno()) {
      /* errno is not one of the fatal I/O codes: name the failing step */
      hts_log_print(opt, LOG_ERROR,
                    "Mirror aborted: cache write failed (%s): %s", what,
                    hts_get_zerror(zErr));
    } else {
      hts_log_print(opt, LOG_ERROR,
                    "Mirror aborted: disk full or filesystem problems");
    }
  }

  /* fatal: stop the mirror, exit non-zero */
  opt->state.exit_xh = -1;
}
/* Ajout d'un fichier en cache */
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
const char *url_adr, const char *url_fil, const char *url_save,
@@ -236,6 +255,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
const char *url_save_suffix = url_save;
int zErr;
/* already failed and aborting; don't touch the broken stream again */
if (cache->zipWriteFailed)
return;
// robots.txt hack
if (url_save == NULL) {
dataincache = 0; // testing links
@@ -346,9 +369,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
*/
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
int zip_zipOpenNewFileInZip_failed = 0;
assertf(zip_zipOpenNewFileInZip_failed);
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
return;
}
/* Write data in cache */
@@ -358,9 +380,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
if ((zErr =
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
(int) r->size)) != Z_OK) {
int zip_zipWriteInFileInZip_failed = 0;
assertf(zip_zipWriteInFileInZip_failed);
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
return;
}
}
} else {
@@ -381,9 +402,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
if ((zErr =
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
(int) nl)) != Z_OK) {
int zip_zipWriteInFileInZip_failed = 0;
assertf(zip_zipWriteInFileInZip_failed);
cache_zip_write_failed(opt, cache, "writing to the cache",
zErr);
fclose(fp);
return;
}
}
} while(nl > 0);
@@ -397,16 +419,14 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
/* Close */
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
int zip_zipCloseFileInZip_failed = 0;
assertf(zip_zipCloseFileInZip_failed);
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
return;
}
/* Flush */
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
int zip_zipFlush_failed = 0;
assertf(zip_zipFlush_failed);
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
return;
}
}

View File

@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
#include "htslib.h"
#include "htszlib.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
@@ -316,6 +317,136 @@ static int disk_fallback_selftest(httrackp *opt) {
return fail;
}
/* Fault-injection state handed to selftest_failing_zwrite() through the
   zlib_filefunc_def opaque pointer. */
typedef struct writefail_inject {
  /** bytes allowed through before writes start failing */
  size_t budget;
  /** errno set on the failing write (ENOSPC, EIO, ...) */
  int fail_errno;
  /** zwrite call count, to detect re-entry into the stream */
  int writes;
} writefail_inject;
/* Write callback for the injected ZIP backend (the #174/#219 condition).
   While the remaining budget covers the request, the bytes are forwarded to
   the underlying FILE and the budget is reduced; once a request exceeds the
   budget, nothing is written, errno is set to inj->fail_errno and 0 is
   returned. Every call is counted so the test can prove that a flagged cache
   never re-enters the stream. */
static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
                                     const void *buf, uLong size) {
  writefail_inject *const state = (writefail_inject *) opaque;
  const size_t wanted = (size_t) size;

  state->writes++;

  if (state->budget < wanted) {
    /* out of budget: short write -> the minizip op returns an error */
    errno = state->fail_errno;
    return 0;
  }

  state->budget -= wanted;
  return (uLong) fwrite(buf, 1, wanted, (FILE *) stream);
}
/* Create a new ZIP at `path` whose write callback is
   selftest_failing_zwrite(), so that writes fail once inj->budget is used up
   and cache_add() hits an error. Returns the zipOpen2() handle. */
static zipFile selftest_open_failing_zip(const char *path,
                                         writefail_inject *inj) {
  zlib_filefunc_def io;

  /* start from the real fopen/read/seek/close callbacks; they ignore opaque */
  fill_fopen_filefunc(&io);

  /* only the write path is replaced; opaque carries the injection state */
  io.opaque = inj;
  io.zwrite_file = selftest_failing_zwrite;

  return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &io);
}
/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP).
   - fil: URL path used as the cache key, under host "example.com"
   - body/body_len: payload; a private, writable copy is handed to
     cache_add() through r.adr and released afterwards.
   If the copy cannot be allocated, a diagnostic is printed and cache_add()
   is not called, so the caller's checks report the failure instead of the
   process writing through a NULL pointer. */
static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
                            const char *body, size_t body_len) {
  htsblk r;
  char locbuf[4];
  char *bodycopy = malloct(body_len);

  if (bodycopy == NULL) {
    fprintf(stderr, "cache-writefail: out of memory (%lu bytes)\n",
            (unsigned long) body_len);
    return;
  }

  /* minimal successful response: 200 OK, binary content type */
  hts_init_htsblk(&r);
  r.statuscode = 200;
  r.size = (LLint) body_len;
  strcpybuff(r.msg, "OK");
  strcpybuff(r.contenttype, "application/octet-stream");

  /* empty Location string */
  locbuf[0] = '\0';
  r.location = locbuf;

  /* presumably: body is served from r.adr rather than from a file on disk
     -- confirm against cache_add() */
  r.is_write = 0;
  memcpy(bodycopy, body, body_len);
  r.adr = bodycopy;

  cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
            NULL);
  freet(bodycopy);
}
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
   stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
   is flagged and a sibling write doesn't re-enter the broken stream.
   - opt: engine options; opt->state.exit_xh is cleared and then inspected
   - dir: existing directory in which the scratch ZIP "wfail.zip" is created
   Returns the number of failed checks (0 on success). */
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
  int fail = 0;
  char path[HTS_URLMAXSIZE];
  /* incompressible + big, so deflate flushes (and fails) mid-write, before
   * close */
  static const size_t body_len = 256 * 1024;
  char *body = malloct(body_len);
  int phase;

  /* without a body there is nothing to drive through the cache */
  if (body == NULL) {
    fprintf(stderr, "cache-writefail: out of memory\n");
    return 1;
  }

  gen_body(body, body_len, 1 /* incompressible */);
  fconcat(path, sizeof(path), dir, "/wfail.zip");

  /* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
     branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
     branch). Both must abort the mirror. */
  for (phase = 0; phase < 2; phase++) {
    cache_back cache;
    writefail_inject inj;
    int writes_after_fail;

    inj.budget = (phase == 0) ? 4096 : 0;
    inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
    inj.writes = 0;

    memset(&cache, 0, sizeof(cache));
    cache.type = 1;
    cache.log = stderr;
    cache.errlog = stderr;
    cache.hashtable = coucal_new(0);
    cache.zipOutput = selftest_open_failing_zip(path, &inj);
    if (cache.zipOutput == NULL) {
      fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
      fail++;
      /* the per-phase hashtable must not outlive this iteration */
      if (cache.hashtable != NULL) {
        coucal_delete(&cache.hashtable);
      }
      continue;
    }

    opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
    writefail_store(opt, &cache, "/blob.bin", body, body_len);

    if (!cache.zipWriteFailed) {
      fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
              phase);
      fail++;
    }
    if (opt->state.exit_xh != -1) {
      fprintf(stderr,
              "cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
              phase, opt->state.exit_xh);
      fail++;
    }

    /* a flagged cache must no-op a sibling write: no further backend write */
    writes_after_fail = inj.writes;
    writefail_store(opt, &cache, "/blob2.bin", body, 16);
    if (inj.writes != writes_after_fail) {
      fprintf(stderr,
              "cache-writefail: phase %d: sibling write re-entered the broken "
              "stream (%d extra backend writes)\n",
              phase, inj.writes - writes_after_fail);
      fail++;
    }

    if (cache.zipOutput != NULL) {
      zipClose(cache.zipOutput,
               NULL); /* best-effort; may fail on the backend */
      cache.zipOutput = NULL;
    }
    /* release the hashtable created for this phase (previously leaked) */
    if (cache.hashtable != NULL) {
      coucal_delete(&cache.hashtable);
    }
  }

  freet(body);
  return fail;
}
int cache_selftests(httrackp *opt, const char *dir) {
int failures = 0;
cache_back cache;

View File

@@ -52,6 +52,10 @@ int cache_selftests(httrackp *opt, const char *dir);
committed file, never by the test). Returns the failed-check count. */
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
   crashing.
   - opt: engine options; the self-test resets and then checks
     opt->state.exit_xh, which is expected to end up at -1
   - dir: existing directory where the scratch ZIP ("wfail.zip") is created
   Returns the failed-check count (0 when every check passed). */
int cache_write_failure_selftest(httrackp *opt, const char *dir);
#endif
#endif

View File

@@ -214,6 +214,8 @@ struct cache_back {
cache_back_zip_entry *zipEntries;
int zipEntriesOffs;
int zipEntriesCapa;
hts_boolean
zipWriteFailed; /**< a cache write failed; stop touching the stream */
};
#ifndef HTS_DEF_FWSTRUCT_hash_struct

View File

@@ -2453,6 +2453,20 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
return 1;
}
break;
case 'W': // cache write-failure handling: httrack -#W <dir>
if (na + 1 < argc) {
const int err =
cache_write_failure_selftest(opt, argv[na + 1]);
printf("cache-writefail: %s\n", err ? "FAIL" : "OK");
htsmain_free();
return err;
} else {
fprintf(stderr, "Option #W requires a directory argument\n");
htsmain_free();
return 1;
}
break;
case 'B': // golden cache fixture read: httrack -#B <dir> [regen]
if (na + 1 < argc) {
const int regen =

View File

@@ -0,0 +1,24 @@
#!/bin/bash
#
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
# tool flags despite the #!/bin/bash above.
# Cache write-failure handling (httrack -#W <dir>). #174/#219.
# A failing new.zip write (disk full) used to crash the process via assertf; it
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
# self-test asserts that; reverting the fix makes -#W abort (SIGABRT) and fail.
set -eu

# Scratch directory for the injected ZIP; removed on every exit path.
dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT

# Under `set -e`, a non-zero httrack exit status stops the script right here.
out=$(httrack -#W "$dir")

# Require the exact success line (error logs also go to stdout); a bad -#W
# falls through to the usage screen, which exits non-zero but never prints it.
if ! printf '%s\n' "$out" | grep -qx "cache-writefail: OK"; then
    echo "expected 'cache-writefail: OK', got: $out" >&2
    exit 1
fi

View File

@@ -26,6 +26,7 @@ TESTS = \
00_runnable.test \
01_engine-cache.test \
01_engine-cache-golden.test \
01_engine-cache-writefail.test \
01_engine-charset.test \
01_engine-cmdline.test \
01_engine-cookies.test \