mirror of
https://github.com/xroche/httrack.git
synced 2026-07-06 00:46:30 +03:00
Compare commits
5 Commits
p2-4-cache
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7928db5729 | ||
|
|
20be48d56c | ||
|
|
484fc47eab | ||
|
|
abaf9b69a2 | ||
|
|
b0466b1d7b |
@@ -572,9 +572,12 @@ int back_finalize(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
&& back[p].r.size != back[p].r.totalsize && !opt->tolerant) {
|
||||
if (back[p].status == STATUS_READY) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file not stored in cache due to bogus state (broken size, expected "
|
||||
LLintP " got " LLintP "): %s%s", back[p].r.totalsize,
|
||||
back[p].r.size, back[p].url_adr, back[p].url_fil);
|
||||
"incomplete transfer (expected " LLintP
|
||||
" bytes, got " LLintP
|
||||
"): file not cached, will be retried on the next update"
|
||||
" (use -%%B to cache anyway): %s%s",
|
||||
back[p].r.totalsize, back[p].r.size, back[p].url_adr,
|
||||
back[p].url_fil);
|
||||
} else {
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"incomplete file not yet stored in cache (expected "
|
||||
@@ -879,11 +882,12 @@ int back_finalize(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
back[p].url_fil, NULL);
|
||||
} else {
|
||||
/* Partial file, but marked as "ok" ? */
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file not stored in cache due to bogus state (incomplete type with %s (%d), size "
|
||||
LLintP "): %s%s", back[p].r.msg, back[p].r.statuscode,
|
||||
(LLint) back[p].r.size, back[p].url_adr,
|
||||
back[p].url_fil);
|
||||
hts_log_print(
|
||||
opt, LOG_WARNING,
|
||||
"file with unresolved type not cached (%s (%d), size " LLintP
|
||||
"): %s%s",
|
||||
back[p].r.msg, back[p].r.statuscode, (LLint) back[p].r.size,
|
||||
back[p].url_adr, back[p].url_fil);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1358,9 +1362,39 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Move a still-writing .delayed placeholder to its final name (#483). */
|
||||
hts_boolean back_delayed_rename(httrackp *opt, lien_back *back,
|
||||
const char *newname) {
|
||||
hts_boolean renamed;
|
||||
|
||||
if (!back->r.is_write || back->tmpfile != NULL ||
|
||||
!IS_DELAYED_EXT(back->url_sav) || strcmp(back->url_sav, newname) == 0)
|
||||
return HTS_TRUE; /* nothing bound to the placeholder name */
|
||||
if (back->r.out != NULL) {
|
||||
fclose(back->r.out);
|
||||
back->r.out = NULL;
|
||||
}
|
||||
renamed = RENAME(back->url_sav, newname) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||
if (renamed && (back->status == STATUS_READY ||
|
||||
(back->r.out = FOPEN(newname, "ab")) != NULL)) {
|
||||
filenote(&opt->state.strc, newname, NULL);
|
||||
hts_log_print(opt, LOG_DEBUG, "moved placeholder %s to %s", back->url_sav,
|
||||
newname);
|
||||
return HTS_TRUE;
|
||||
}
|
||||
/* partial lost: drop only what we own (Windows rename won't overwrite) */
|
||||
hts_log_print(opt, LOG_WARNING | LOG_ERRNO, "unable to move %s to %s",
|
||||
back->url_sav, newname);
|
||||
back->r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(back->r.msg, "Write error on disk");
|
||||
back->r.is_write = 0;
|
||||
(void) UNLINK(renamed ? newname : back->url_sav);
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
// delete entry
|
||||
/* Discard a cancelled mid-write .delayed placeholder (unusable across runs). */
|
||||
static void back_delayed_discard(httrackp *opt, lien_back *back) {
|
||||
void back_delayed_discard(httrackp *opt, lien_back *back) {
|
||||
if (back->r.out != NULL) {
|
||||
fclose(back->r.out);
|
||||
back->r.out = NULL;
|
||||
|
||||
@@ -113,6 +113,12 @@ void back_set_locked(struct_back * sback, const int p);
|
||||
void back_set_unlocked(struct_back * sback, const int p);
|
||||
int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
const int p);
|
||||
/* Discard back's on-disk .delayed placeholder and its refname. */
|
||||
void back_delayed_discard(httrackp *opt, lien_back *back);
|
||||
/* Move back's .delayed placeholder (and open stream) to newname;
|
||||
HTS_FALSE = file lost, slot flagged in error. */
|
||||
hts_boolean back_delayed_rename(httrackp *opt, lien_back *back,
|
||||
const char *newname);
|
||||
void back_index_unlock(struct_back * sback, const int p);
|
||||
int back_clear_entry(lien_back * back);
|
||||
int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
|
||||
@@ -221,23 +221,38 @@ struct cache_back_zip_entry {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||
did. */
|
||||
/* Consecutive entry write failures before the cache stream is declared dead. */
|
||||
#define CACHE_MAX_WRITE_FAILURES 8
|
||||
|
||||
/* Cache write failed: a fatal errno or a failure streak aborts the mirror
|
||||
(exit_xh); an isolated failure only drops the current entry. */
|
||||
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||
const char *what, int zErr) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (check_fatal_io_errno()) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
const char *what, int zErr,
|
||||
hts_boolean entry_open, const char *url_adr,
|
||||
const char *url_fil) {
|
||||
const int fatal_errno = zErr == ZIP_ERRNO && check_fatal_io_errno();
|
||||
|
||||
cache->zipWriteFailures++;
|
||||
if (fatal_errno || cache->zipWriteFailures >= CACHE_MAX_WRITE_FAILURES) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (fatal_errno) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
}
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
} else {
|
||||
if (entry_open)
|
||||
zipCloseFileInZip((zipFile) cache->zipOutput); /* abandon, best-effort */
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"cache write failed (%s: %s), entry not cached: %s%s", what,
|
||||
hts_get_zerror(zErr), url_adr, url_fil);
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
}
|
||||
|
||||
/* Add a file to the cache */
|
||||
@@ -287,10 +302,19 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if (r->size < 0) // error
|
||||
return;
|
||||
|
||||
// data in cache
|
||||
if (dataincache) {
|
||||
assertf(((int) r->size) == r->size);
|
||||
//entryBodySize = (int) r->size;
|
||||
// data in cache: the body must fit the 32-bit zip write API
|
||||
if (dataincache && (LLint) (int) r->size != r->size) {
|
||||
if (r->is_write && url_save != NULL && strnotempty(url_save)) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file too large for the cache, storing headers only: %s%s",
|
||||
url_adr, url_fil);
|
||||
dataincache = 0;
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"entry too large for the cache, not cached: %s%s", url_adr,
|
||||
url_fil);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Fields */
|
||||
@@ -370,7 +394,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
*/
|
||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -381,7 +406,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||
(int) r->size)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr,
|
||||
HTS_TRUE, url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -403,8 +429,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||
(int) nl)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||
zErr);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr,
|
||||
HTS_TRUE, url_adr, url_fil);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
@@ -420,15 +446,19 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
|
||||
/* Close */
|
||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Flush */
|
||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
cache->zipWriteFailures = 0; /* entry stored: reset the failure streak */
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -769,11 +799,11 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
|
||||
}
|
||||
|
||||
/* A tampered X-Size must be rejected before the size-driven malloc.
|
||||
The alloc casts to int (malloct((int) r.size + 1)), so bound it to
|
||||
[0, INT_MAX): a negative value, or a positive one whose (int) cast
|
||||
truncates negative, would otherwise wrap to a huge allocation. */
|
||||
if (r.size < 0 || r.size >= INT_MAX) {
|
||||
/* A negative X-Size is corrupt; so is one >= INT_MAX when the data
|
||||
is in the zip (the write path asserts int-sized). Headers-only
|
||||
entries legitimately exceed INT_MAX (>2GB body on disk): keep
|
||||
them, or every update would re-fetch the file. */
|
||||
if (r.size < 0 || (dataincache && r.size >= INT_MAX)) {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Cache Read Error : Bad Size");
|
||||
}
|
||||
@@ -969,7 +999,10 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
strcpybuff(r.msg,
|
||||
"Previous cache file not found (empty filename)");
|
||||
}
|
||||
} else { /* Read in memory from disk */
|
||||
} else if (r.size >= INT_MAX) { /* too big to read in memory */
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Cache Read Error : Bad Size");
|
||||
} else { /* Read in memory from disk */
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), previous_save), "rb");
|
||||
|
||||
if (fp != NULL) {
|
||||
|
||||
@@ -48,6 +48,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@@ -321,6 +322,7 @@ typedef struct {
|
||||
size_t budget; /**< bytes allowed through before writes start failing */
|
||||
int fail_errno; /**< errno set on the failing write (ENOSPC, EIO, ...) */
|
||||
int writes; /**< zwrite call count, to detect re-entry into the stream */
|
||||
int fail_once; /**< recover (unlimited budget) after the first failure */
|
||||
} writefail_inject;
|
||||
|
||||
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||
@@ -335,6 +337,8 @@ static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||
inj->budget -= (size_t) size;
|
||||
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||
}
|
||||
if (inj->fail_once)
|
||||
inj->budget = (size_t) -1; /* the backend recovers after this failure */
|
||||
errno = inj->fail_errno;
|
||||
return 0; /* short write -> the minizip op returns an error */
|
||||
}
|
||||
@@ -373,9 +377,50 @@ static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||
freet(bodycopy);
|
||||
}
|
||||
|
||||
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||
/* Store an entry claiming a >2GB body; the degrade path never reads data. */
|
||||
static void writefail_store_oversized(httrackp *opt, cache_back *cache,
|
||||
const char *fil, int is_write) {
|
||||
htsblk r;
|
||||
char locbuf[4];
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = 200;
|
||||
r.size = (LLint) INT_MAX + 1;
|
||||
strcpybuff(r.msg, "OK");
|
||||
strcpybuff(r.contenttype, "application/octet-stream");
|
||||
locbuf[0] = '\0';
|
||||
r.location = locbuf;
|
||||
r.is_write = (short int) is_write;
|
||||
cache_add(opt, cache, &r, "example.com", fil, "example.com/big.bin", 1, NULL);
|
||||
}
|
||||
|
||||
/* Read back `entryname`: extra field (cached headers) and body. Returns the
|
||||
body length, or -1 if the entry is absent or unreadable. */
|
||||
static int writefail_read_entry(const char *path, const char *entryname,
|
||||
char *extra, size_t extralen, char *body,
|
||||
size_t bodylen) {
|
||||
unzFile z = unzOpen(path);
|
||||
int n = -1;
|
||||
|
||||
if (z == NULL)
|
||||
return -1;
|
||||
if (unzLocateFile(z, entryname, 1) == UNZ_OK &&
|
||||
unzOpenCurrentFile(z) == UNZ_OK) {
|
||||
const int elen = unzGetLocalExtrafield(z, extra, (unsigned) (extralen - 1));
|
||||
|
||||
if (elen >= 0) {
|
||||
extra[elen] = '\0';
|
||||
n = unzReadCurrentFile(z, body, (unsigned) bodylen);
|
||||
}
|
||||
unzCloseCurrentFile(z);
|
||||
}
|
||||
unzClose(z);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Cache write-failure policy (#174/#219): fatal errno or a failure streak
|
||||
stops the mirror (exit_xh=-1, no crash); isolated/oversized drops the entry.
|
||||
*/
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
int fail = 0;
|
||||
char path[HTS_URLMAXSIZE];
|
||||
@@ -388,9 +433,8 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
gen_body(body, body_len, 1 /* incompressible */);
|
||||
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||
|
||||
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||
branch). Both must abort the mirror. */
|
||||
/* phase 0: fatal errno (ENOSPC) aborts at once; phase 1: persistent EIO
|
||||
drops entries until the streak caps out, then aborts. */
|
||||
for (phase = 0; phase < 2; phase++) {
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
@@ -399,6 +443,7 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
inj.budget = (phase == 0) ? 4096 : 0;
|
||||
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
@@ -412,7 +457,25 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
}
|
||||
|
||||
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (phase == 0) {
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
} else {
|
||||
/* the abort must land exactly on the 8th consecutive failure */
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 7; i++) {
|
||||
char fil[32];
|
||||
|
||||
snprintf(fil, sizeof(fil), "/b%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
}
|
||||
if (cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase 1: aborted before the "
|
||||
"8th consecutive failure\n");
|
||||
fail++;
|
||||
}
|
||||
writefail_store(opt, &cache, "/b7.bin", body, 16);
|
||||
}
|
||||
if (!cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||
phase);
|
||||
@@ -443,6 +506,136 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
}
|
||||
}
|
||||
|
||||
/* failures with successes in between reset the streak: never aborts */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
int i;
|
||||
|
||||
inj.budget = (size_t) -1;
|
||||
inj.fail_errno = EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
for (i = 0; i < 10; i++) {
|
||||
char fil[32];
|
||||
|
||||
inj.budget = 0; /* this store fails */
|
||||
snprintf(fil, sizeof(fil), "/s%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
inj.budget = (size_t) -1; /* this one succeeds and resets the streak */
|
||||
snprintf(fil, sizeof(fil), "/ok%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
}
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: scattered: non-consecutive failures aborted "
|
||||
"the mirror (flagged=%d, exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
|
||||
/* isolated failure: only that entry drops; a later sibling round-trips */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
char extra[8192];
|
||||
char rbody[64];
|
||||
int n;
|
||||
|
||||
inj.budget = 4096;
|
||||
inj.fail_errno = EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 1;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: skip: isolated failure aborted the mirror "
|
||||
"(flagged=%d, exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
n = writefail_read_entry(path, "http://example.com/blob2.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody));
|
||||
if (n != 16 || memcmp(rbody, body, 16) != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: skip: sibling entry lost after a skipped "
|
||||
"entry (%d)\n",
|
||||
n);
|
||||
fail++;
|
||||
}
|
||||
}
|
||||
|
||||
/* >2GB bodies: in-memory drops the entry, on-disk degrades to headers-only */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
char extra[8192];
|
||||
char rbody[64];
|
||||
int n;
|
||||
|
||||
inj.budget = (size_t) -1; /* no injected failure */
|
||||
inj.fail_errno = 0;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
writefail_store_oversized(opt, &cache, "/bigmem.bin", 0 /* in-memory */);
|
||||
writefail_store_oversized(opt, &cache, "/bigdisk.bin", 1 /* on-disk */);
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: mirror aborted (flagged=%d, "
|
||||
"exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
if (writefail_read_entry(path, "http://example.com/bigmem.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody)) >= 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: in-memory entry was stored\n");
|
||||
fail++;
|
||||
}
|
||||
n = writefail_read_entry(path, "http://example.com/bigdisk.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody));
|
||||
if (n != 0 || strstr(extra, "X-In-Cache: 0") == NULL) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: on-disk entry not stored "
|
||||
"headers-only (%d)\n",
|
||||
n);
|
||||
fail++;
|
||||
}
|
||||
}
|
||||
|
||||
freet(body);
|
||||
return fail;
|
||||
}
|
||||
@@ -965,6 +1158,46 @@ static void corrupt_build_etag(httrackp *opt) {
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
/* Like corrupt_build_etag, but the victim is headers-only (X-In-Cache: 0,
|
||||
body on disk): the shape every non-html file is stored with. */
|
||||
static void corrupt_build_disk(httrackp *opt) {
|
||||
cache_back cache;
|
||||
htsblk w;
|
||||
char locw[4];
|
||||
char BIGSTK save[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK catbuff[HTS_URLMAXSIZE * 2];
|
||||
char *path;
|
||||
FILE *fp;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
fconcat(save, sizeof(save), StringBuff(opt->path_html_utf8),
|
||||
CORRUPT_ADR "/victim.bin");
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
hts_init_htsblk(&w);
|
||||
w.statuscode = 200;
|
||||
w.size = (LLint) sizeof(corrupt_body_b) - 1;
|
||||
strcpybuff(w.msg, "OK");
|
||||
strcpybuff(w.contenttype, "application/octet-stream");
|
||||
strcpybuff(w.etag, "AAAAAAAAAAAAAAAAAAAA");
|
||||
locw[0] = '\0';
|
||||
w.location = locw;
|
||||
w.is_write = 0;
|
||||
cache_add(opt, &cache, &w, CORRUPT_ADR, "/victim.bin", save,
|
||||
0 /* all_in_cache */, NULL);
|
||||
selftest_close(&cache);
|
||||
/* the reader only checks this file exists; it never reads it here */
|
||||
path = fconv(catbuff, sizeof(catbuff), save);
|
||||
(void) structcheck(path);
|
||||
fp = FOPEN(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Patch the nth of total occurrences of pat (same-length rep) in new.zip. */
|
||||
static void corrupt_patch(httrackp *opt, const char *pat, size_t patlen,
|
||||
const char *rep, size_t nth, size_t total) {
|
||||
@@ -1024,8 +1257,8 @@ static void corrupt_victim_body(httrackp *opt) {
|
||||
/canary.html: the victim must be rejected (wantmsg pins which path) and the
|
||||
canary must still decode byte-exact, proving one bad entry never taints a
|
||||
sibling read. */
|
||||
static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
|
||||
const char *what) {
|
||||
static int corrupt_expect_victim_fil(httrackp *opt, const char *fil,
|
||||
const char *wantmsg, const char *what) {
|
||||
cache_back cache;
|
||||
htsblk v, c;
|
||||
char BIGSTK lv[HTS_URLMAXSIZE * 2];
|
||||
@@ -1034,7 +1267,7 @@ static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
lv[0] = lc[0] = '\0';
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, "/victim.html", "", lv, NULL, 1);
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, fil, "", lv, NULL, 1);
|
||||
if (v.statuscode != STATUSCODE_INVALID) {
|
||||
fprintf(stderr, "%s: %s: victim: statuscode is %d, expected %d\n",
|
||||
selftest_tag, what, v.statuscode, STATUSCODE_INVALID);
|
||||
@@ -1061,6 +1294,34 @@ static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* corrupt_expect_victim_fil() applied to the "/victim.html" entry; the
   callee's result is returned unchanged. */
static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
                                 const char *what) {
  const char *const victim_fil = "/victim.html";

  return corrupt_expect_victim_fil(opt, victim_fil, wantmsg, what);
}
|
||||
|
||||
/* Headers-only probe of the disk victim: must parse OK with the size kept. */
|
||||
static int corrupt_expect_disk_header(httrackp *opt, LLint wantsize,
|
||||
const char *what) {
|
||||
cache_back cache;
|
||||
htsblk v;
|
||||
char BIGSTK lv[HTS_URLMAXSIZE * 2];
|
||||
int fail = 0;
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
lv[0] = '\0';
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, "/victim.bin", NULL, lv, NULL, 1);
|
||||
if (v.statuscode != 200 || v.size != wantsize) {
|
||||
fprintf(stderr,
|
||||
"%s: %s: statuscode %d size " LLintP ", expected 200/" LLintP "\n",
|
||||
selftest_tag, what, v.statuscode, (LLint) v.size, wantsize);
|
||||
fail++;
|
||||
}
|
||||
if (v.adr != NULL)
|
||||
freet(v.adr);
|
||||
selftest_close(&cache);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* One zip corruption case: build, patch, then check victim+canary in-session.
|
||||
*/
|
||||
static int corrupt_case_zip(httrackp *opt, const char *pat, const char *rep,
|
||||
@@ -1109,5 +1370,33 @@ int cache_corruption_selftest(httrackp *opt, const char *dir) {
|
||||
failures += corrupt_expect_victim(opt, "Cache Read Error : Bad Size",
|
||||
"X-Size above INT_MAX");
|
||||
|
||||
/* A headers-only entry (X-In-Cache: 0) may carry an X-Size >= INT_MAX: that
|
||||
is how every >2GB non-html file is stored. It must survive a header probe
|
||||
(or every update re-fetches the file); an in-memory read still rejects. */
|
||||
corrupt_build_disk(opt);
|
||||
corrupt_patch(opt, "Etag: AAAAAAAAAAAAAAAAAAAA", 26,
|
||||
"X-Size: 2147483648AAAAAAAA", 1, 1);
|
||||
failures += corrupt_expect_disk_header(opt, (LLint) 2147483648LL,
|
||||
"headers-only X-Size above INT_MAX");
|
||||
failures += corrupt_expect_victim_fil(opt, "/victim.bin",
|
||||
"Cache Read Error : Bad Size",
|
||||
"in-memory X-Size above INT_MAX");
|
||||
|
||||
/* exactly INT_MAX pins the >= boundary: (int) r.size + 1 would overflow */
|
||||
corrupt_build_disk(opt);
|
||||
corrupt_patch(opt, "Etag: AAAAAAAAAAAAAAAAAAAA", 26,
|
||||
"X-Size: 2147483647AAAAAAAA", 1, 1);
|
||||
failures += corrupt_expect_victim_fil(opt, "/victim.bin",
|
||||
"Cache Read Error : Bad Size",
|
||||
"in-memory X-Size at INT_MAX");
|
||||
|
||||
/* the negative check must stay global, headers-only included */
|
||||
corrupt_build_disk(opt);
|
||||
corrupt_patch(opt, "Etag: AAAAAAAAAAAAAAAAAAAA", 26,
|
||||
"X-Size: -2147483648AAAAAAA", 1, 1);
|
||||
failures += corrupt_expect_victim_fil(opt, "/victim.bin",
|
||||
"Cache Read Error : Bad Size",
|
||||
"headers-only negative X-Size");
|
||||
|
||||
return failures;
|
||||
}
|
||||
|
||||
@@ -52,8 +52,8 @@ int cache_selftests(httrackp *opt, const char *dir);
|
||||
committed file, never by the test). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||
crashing. Returns the failed-check count. */
|
||||
/* Cache write-failure policy (#174/#219): abort on fatal errno or a streak,
|
||||
drop just the entry otherwise. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
/* Exercise the hts_cache_reconcile() generation policies on file fixtures
|
||||
|
||||
@@ -2852,6 +2852,9 @@ int check_fatal_io_errno(void) {
|
||||
#endif
|
||||
#ifdef EROFS
|
||||
case EROFS: /* Read-only file system */
|
||||
#endif
|
||||
#ifdef EDQUOT
|
||||
case EDQUOT: /* Disk quota exceeded */
|
||||
#endif
|
||||
return 1;
|
||||
break;
|
||||
|
||||
@@ -216,6 +216,7 @@ struct cache_back {
|
||||
int zipEntriesCapa;
|
||||
hts_boolean
|
||||
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||
int zipWriteFailures; /**< consecutive entry write failures; reset on store */
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||
|
||||
@@ -4566,6 +4566,12 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
back_maydelete(opt, cache, sback, b); // cancel
|
||||
b = -1;
|
||||
|
||||
/* the cancel may leave the now-unreferenced placeholder on disk
|
||||
* (#483) */
|
||||
if (fexist_utf8(delayed_back.url_sav)) {
|
||||
back_delayed_discard(opt, &delayed_back);
|
||||
}
|
||||
|
||||
/* Recompute filename with MIME type */
|
||||
afs->save[0] = '\0';
|
||||
url_savename(afs, former, heap(ptr)->adr,
|
||||
@@ -4782,6 +4788,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* move a still-writing placeholder before the url_sav patch
|
||||
blinds every cleanup to it (#483) */
|
||||
back_delayed_rename(opt, &back[b], afs->save);
|
||||
/* patch url_sav BEFORE finalize: it records/caches under this name
|
||||
*/
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
|
||||
@@ -4,10 +4,9 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Cache write-failure handling (httrack -#test=cache-writefail <dir>). #174/#219.
|
||||
# A failing new.zip write (disk full) used to crash the process via assertf; it
|
||||
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
|
||||
# self-test asserts that; reverting the fix makes -#test=cache-writefail abort (SIGABRT) and fail.
|
||||
# Cache write-failure policy (-#test=cache-writefail <dir>). #174/#219: disk
|
||||
# full or a failure streak aborts cleanly; an isolated failure or an oversized
|
||||
# entry is only dropped.
|
||||
|
||||
set -eu
|
||||
|
||||
@@ -22,3 +21,9 @@ printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
|
||||
echo "expected 'cache-writefail: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# A skipped entry must be warned about with its URL.
|
||||
printf '%s\n' "$out" | grep -q "entry not cached: example.com/" || {
|
||||
echo "expected a URL-bearing skip warning" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
#!/bin/bash
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
|
||||
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns
|
||||
# "incomplete transfer" and skips the cache; -%B (tolerant) accepts it.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# Default: warn, but the file is still written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-found 'bogus state \(broken size' \
|
||||
--log-found 'incomplete transfer \(expected' \
|
||||
httrack 'BASEURL/size/index.html'
|
||||
|
||||
# -%B (tolerant): no warning, file written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-not-found 'bogus state' \
|
||||
--log-not-found 'incomplete transfer|not cached' \
|
||||
httrack 'BASEURL/size/index.html' '-%B'
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
|
||||
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
|
||||
# no "bogus state" cache warnings, resolvable links still land correctly.
|
||||
# no "not cached" warnings, resolvable links still land correctly.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -16,5 +16,5 @@ bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
|
||||
--not-found 'delayed/noloc.html' \
|
||||
--not-found 'delayed/selfloop.html' \
|
||||
--not-found 'delayed/chain9.pdf' \
|
||||
--log-not-found 'bogus state' \
|
||||
--log-not-found 'not cached' \
|
||||
httrack 'BASEURL/delayed/index.html'
|
||||
|
||||
@@ -7,10 +7,8 @@ set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# cancelled crawls can orphan .delayed placeholders (#483): skip that audit
|
||||
start=$(date +%s)
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--skip-delayed-audit \
|
||||
--log-found 'More than 2 seconds passed' \
|
||||
httrack 'BASEURL/trickle/index.html' -E2 -c4
|
||||
wall=$(($(date +%s) - start))
|
||||
|
||||
@@ -44,7 +44,7 @@ bash "$top_srcdir/tests/local-crawl.sh" --rerun \
|
||||
--file-matches 'big/a/blk2.css' 'url\(blk2-bg\.png\)' \
|
||||
--file-matches 'big/p/5.html' "document\\.write\\('<a href=\"\\.\\./f5/dw\\.html\"" \
|
||||
--file-not-matches 'big/p/1.html' 'href="/big/' \
|
||||
--log-not-found 'bogus state|[Pp]anic|assert' \
|
||||
--log-not-found 'not cached|[Pp]anic|assert' \
|
||||
--log-found '\(404\) at link [^ ]*/big/e/404\.html' \
|
||||
--log-found '\(410\) at link [^ ]*/big/e/410\.html' \
|
||||
--log-found '\(500\) at link [^ ]*/big/e/500\.html' \
|
||||
|
||||
13
tests/39_local-delayed-cancel.test
Normal file
13
tests/39_local-delayed-cancel.test
Normal file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Cancelled delayed-type-checks must not orphan .delayed placeholders (#483).
|
||||
# Timing-dependent (hence two tries); -A keeps the window reachable.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
for _ in 1 2; do
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
httrack 'BASEURL/dcancel/index.html' -E1 -c4 -A25000
|
||||
done
|
||||
@@ -103,6 +103,7 @@ TESTS = \
|
||||
35_local-maxsize.test \
|
||||
36_local-bigcrawl.test \
|
||||
37_local-cache-outage.test \
|
||||
38_local-update-304.test
|
||||
38_local-update-304.test \
|
||||
39_local-delayed-cancel.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -97,7 +97,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
|
||||
# --- parse leading control flags --------------------------------------------
|
||||
declare -a audit=()
|
||||
declare -a cookies=()
|
||||
skip_delayed_audit=""
|
||||
scheme=http
|
||||
pos=0
|
||||
args=("$@")
|
||||
@@ -123,9 +122,6 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
cookies+=("${args[$pos]}")
|
||||
;;
|
||||
--skip-delayed-audit)
|
||||
skip_delayed_audit=1
|
||||
;;
|
||||
--errors | --files)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
@@ -293,15 +289,12 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107).
|
||||
# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483)
|
||||
if test -z "$skip_delayed_audit"; then
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
# No crawl, even a cancelled one, may leave .delayed temporaries (#107, #483).
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
|
||||
@@ -781,7 +781,7 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
self.send_raw(b"", "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
# warns "incomplete transfer" and skips the cache unless -%B.
|
||||
def route_size_index(self):
|
||||
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||
|
||||
@@ -913,6 +913,26 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# #483: trickled .bin pages so the -E stop lands in the type waiter's
|
||||
# unlock-to-patch window with body bytes pending.
|
||||
def route_dcancel_index(self):
|
||||
self.send_bin_index()
|
||||
|
||||
def route_dcancel_page(self):
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
self.send_header("Content-Length", "4096")
|
||||
self.end_headers()
|
||||
if self.command == "HEAD":
|
||||
return
|
||||
try:
|
||||
for _ in range(32):
|
||||
self.wfile.write(b"z" * 128)
|
||||
self.wfile.flush()
|
||||
time.sleep(0.05)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# -M byte cap (#77): large fast files so a crawl overruns -M immediately.
|
||||
BIGFILE_BYTES = 640 * 1024
|
||||
|
||||
@@ -976,6 +996,15 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/trickle/p5.bin": route_trickle_page,
|
||||
"/trickle/p6.bin": route_trickle_page,
|
||||
"/trickle/p7.bin": route_trickle_page,
|
||||
"/dcancel/index.html": route_dcancel_index,
|
||||
"/dcancel/p0.bin": route_dcancel_page,
|
||||
"/dcancel/p1.bin": route_dcancel_page,
|
||||
"/dcancel/p2.bin": route_dcancel_page,
|
||||
"/dcancel/p3.bin": route_dcancel_page,
|
||||
"/dcancel/p4.bin": route_dcancel_page,
|
||||
"/dcancel/p5.bin": route_dcancel_page,
|
||||
"/dcancel/p6.bin": route_dcancel_page,
|
||||
"/dcancel/p7.bin": route_dcancel_page,
|
||||
"/bigfiles/index.html": route_bigfiles_index,
|
||||
"/bigfiles/p0.bin": route_bigfile,
|
||||
"/bigfiles/p1.bin": route_bigfile,
|
||||
|
||||
Reference in New Issue
Block a user