mirror of
https://github.com/xroche/httrack.git
synced 2026-07-06 00:46:30 +03:00
Compare commits
12 Commits
maxsize-te
...
p2-xsize-h
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
55fc3407f2 | ||
|
|
d93e6f90da | ||
|
|
484fc47eab | ||
|
|
abaf9b69a2 | ||
|
|
b0466b1d7b | ||
|
|
f785286c87 | ||
|
|
440a8603a9 | ||
|
|
4979e58dc0 | ||
|
|
894cf5a8d2 | ||
|
|
0c1aa51385 | ||
|
|
fb4267c6d7 | ||
|
|
f0b044c2f3 |
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.10], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.11], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,10 +29,11 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:2:0: 3.49.10 only appends tail fields to the options struct (no existing
|
||||
# symbol or offset changed vs 3.49.9), so it stays soname .so.3; bump revision.
|
||||
# 3:3:0: 3.49.11 only adds enum values, macros and inline helpers to the
|
||||
# installed headers (no struct layout or exported signature changed vs
|
||||
# 3.49.10), so it stays soname .so.3; bump revision.
|
||||
# (3:0:0 was the htsblk mime-buffer widening, the ABI break that moved .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:2:0"
|
||||
VERSION_INFO="3:3:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
21
debian/changelog
vendored
21
debian/changelog
vendored
@@ -1,3 +1,24 @@
|
||||
httrack (3.49.11-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: crawl correctness and security fixes (network-facing
|
||||
buffer overflows, file-type detection, redirect handling) and modernized
|
||||
web defaults; full list in history.txt.
|
||||
* Add DEP-12 upstream metadata (#466).
|
||||
* Bump debhelper compat to 14 (#466).
|
||||
* Drop the redundant Priority field and update the NMU lintian override to
|
||||
the current tag names (#466).
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 05 Jul 2026 00:03:18 +0200
|
||||
|
||||
httrack (3.49.10-2) unstable; urgency=medium
|
||||
|
||||
* Fix FTBFS: tests/28_local-pause failed instead of skipping when python3 is
|
||||
absent (the local-server tests need python3, which the buildds lack). Add
|
||||
patches/skip-local-pause-test-without-python3.patch to guard the test on
|
||||
python3 up front, like its siblings, so it skips cleanly.
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 28 Jun 2026 20:18:46 +0200
|
||||
|
||||
httrack (3.49.10-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: new download-pacing and URL-handling options plus a
|
||||
|
||||
17
history.txt
17
history.txt
@@ -4,6 +4,23 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-11
|
||||
+ New: parse robots.txt Allow rules and path wildcards per RFC 9309 (#452)
|
||||
+ New: advertise deflate in Accept-Encoding and decode deflate responses (#450)
|
||||
+ New: follow <source> and <track> media elements as embedded links (#451)
|
||||
+ New: added modern web MIME types to the type/extension table (#448)
|
||||
+ Fixed: enforce the -E time limit during a slow transfer instead of only between files (#481)
|
||||
+ Fixed: sniff the leading bytes of a download so a misdeclared Content-Type no longer renames a correct URL extension
|
||||
+ Fixed: fast transfers could be saved under their temporary .delayed placeholder name (#5, #107)
|
||||
+ Fixed: follow a redirect that maps to the same saved file instead of writing a self-pointing stub (#159)
|
||||
+ Fixed: several network-facing buffer overflows in the FTP, Java and HTML parsers
|
||||
+ Fixed: the htsjava plugin could not be loaded (hidden entry points, stale library name)
|
||||
+ Fixed: HTML-escape truncation and a cache-buffer leak in the parser
|
||||
+ Changed: modernized the default User-Agent to an honest HTTrack identifier (#449)
|
||||
+ Changed: decode the full WHATWG set of HTML named character references (#443)
|
||||
+ Changed: refreshed stale HTTP status, proxy-port and TLS-floor constants (#453)
|
||||
+ Changed: multiple internal hardening, build, test and CI improvements
|
||||
|
||||
3.49-10
|
||||
+ New: --cookies-file to preload a Netscape cookies.txt before crawling (#215)
|
||||
+ New: --pause to space out file downloads by a random delay (#185)
|
||||
|
||||
@@ -572,9 +572,12 @@ int back_finalize(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
&& back[p].r.size != back[p].r.totalsize && !opt->tolerant) {
|
||||
if (back[p].status == STATUS_READY) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file not stored in cache due to bogus state (broken size, expected "
|
||||
LLintP " got " LLintP "): %s%s", back[p].r.totalsize,
|
||||
back[p].r.size, back[p].url_adr, back[p].url_fil);
|
||||
"incomplete transfer (expected " LLintP
|
||||
" bytes, got " LLintP
|
||||
"): file not cached, will be retried on the next update"
|
||||
" (use -%%B to cache anyway): %s%s",
|
||||
back[p].r.totalsize, back[p].r.size, back[p].url_adr,
|
||||
back[p].url_fil);
|
||||
} else {
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"incomplete file not yet stored in cache (expected "
|
||||
@@ -879,11 +882,12 @@ int back_finalize(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
back[p].url_fil, NULL);
|
||||
} else {
|
||||
/* Partial file, but marked as "ok" ? */
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file not stored in cache due to bogus state (incomplete type with %s (%d), size "
|
||||
LLintP "): %s%s", back[p].r.msg, back[p].r.statuscode,
|
||||
(LLint) back[p].r.size, back[p].url_adr,
|
||||
back[p].url_fil);
|
||||
hts_log_print(
|
||||
opt, LOG_WARNING,
|
||||
"file with unresolved type not cached (%s (%d), size " LLintP
|
||||
"): %s%s",
|
||||
back[p].r.msg, back[p].r.statuscode, (LLint) back[p].r.size,
|
||||
back[p].url_adr, back[p].url_fil);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
176
src/htscache.c
176
src/htscache.c
@@ -40,6 +40,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsbasenet.h"
|
||||
#include "htsmd5.h"
|
||||
#include <limits.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "htszlib.h"
|
||||
@@ -220,23 +221,38 @@ struct cache_back_zip_entry {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||
did. */
|
||||
/* Consecutive entry write failures before the cache stream is declared dead. */
|
||||
#define CACHE_MAX_WRITE_FAILURES 8
|
||||
|
||||
/* Cache write failed: a fatal errno or a failure streak aborts the mirror
|
||||
(exit_xh); an isolated failure only drops the current entry. */
|
||||
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||
const char *what, int zErr) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (check_fatal_io_errno()) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
const char *what, int zErr,
|
||||
hts_boolean entry_open, const char *url_adr,
|
||||
const char *url_fil) {
|
||||
const int fatal_errno = zErr == ZIP_ERRNO && check_fatal_io_errno();
|
||||
|
||||
cache->zipWriteFailures++;
|
||||
if (fatal_errno || cache->zipWriteFailures >= CACHE_MAX_WRITE_FAILURES) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (fatal_errno) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
}
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
} else {
|
||||
if (entry_open)
|
||||
zipCloseFileInZip((zipFile) cache->zipOutput); /* abandon, best-effort */
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"cache write failed (%s: %s), entry not cached: %s%s", what,
|
||||
hts_get_zerror(zErr), url_adr, url_fil);
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
}
|
||||
|
||||
/* Ajout d'un fichier en cache */
|
||||
@@ -286,10 +302,19 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if (r->size < 0) // error
|
||||
return;
|
||||
|
||||
// data in cache
|
||||
if (dataincache) {
|
||||
assertf(((int) r->size) == r->size);
|
||||
//entryBodySize = (int) r->size;
|
||||
// data in cache: the body must fit the 32-bit zip write API
|
||||
if (dataincache && (LLint) (int) r->size != r->size) {
|
||||
if (r->is_write && url_save != NULL && strnotempty(url_save)) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"file too large for the cache, storing headers only: %s%s",
|
||||
url_adr, url_fil);
|
||||
dataincache = 0;
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"entry too large for the cache, not cached: %s%s", url_adr,
|
||||
url_fil);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Fields */
|
||||
@@ -369,7 +394,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
*/
|
||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -380,7 +406,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||
(int) r->size)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr,
|
||||
HTS_TRUE, url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -402,8 +429,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||
(int) nl)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||
zErr);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr,
|
||||
HTS_TRUE, url_adr, url_fil);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
@@ -419,15 +446,19 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
|
||||
/* Close */
|
||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Flush */
|
||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr, HTS_FALSE,
|
||||
url_adr, url_fil);
|
||||
return;
|
||||
}
|
||||
|
||||
cache->zipWriteFailures = 0; /* entry stored: reset the failure streak */
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -768,6 +799,15 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
|
||||
}
|
||||
|
||||
/* A negative X-Size is corrupt; so is one >= INT_MAX when the data
|
||||
is in the zip (the write path only stores int-sized bodies). Headers-only
|
||||
entries legitimately exceed INT_MAX (>2GB body on disk): keep
|
||||
them, or every update would re-fetch the file. */
|
||||
if (r.size < 0 || (dataincache && r.size >= INT_MAX)) {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Cache Read Error : Bad Size");
|
||||
}
|
||||
|
||||
/* Complete fields */
|
||||
r.totalsize = r.size;
|
||||
r.adr = NULL;
|
||||
@@ -794,7 +834,8 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
} // otherwise, the ZIP file is supposed to be consistent with data.
|
||||
}
|
||||
/* Read data ? */
|
||||
else { /* ne pas lire uniquement header */
|
||||
else if (r.statuscode !=
|
||||
STATUSCODE_INVALID) { /* ne pas lire uniquement header */
|
||||
int ok = 0;
|
||||
|
||||
#if HTS_DIRECTDISK
|
||||
@@ -958,7 +999,10 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
strcpybuff(r.msg,
|
||||
"Previous cache file not found (empty filename)");
|
||||
}
|
||||
} else { /* Read in memory from disk */
|
||||
} else if (r.size >= INT_MAX) { /* too big to read in memory */
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Cache Read Error : Bad Size");
|
||||
} else { /* Read in memory from disk */
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), previous_save), "rb");
|
||||
|
||||
if (fp != NULL) {
|
||||
@@ -1420,6 +1464,86 @@ static int hts_rename(httrackp * opt, const char *a, const char *b) {
|
||||
return rename(a, b);
|
||||
}
|
||||
|
||||
/* Pathname of a file inside the mirror dir (rotating concat buffer). */
|
||||
static char *reconcile_path(httrackp *opt, const char *name) {
|
||||
return fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), name);
|
||||
}
|
||||
|
||||
/* Interrupted-run heuristic: prefer the old generation when the new cache
|
||||
stalled below NEW_TINY while the old one grew past OLD_SOLID (historical
|
||||
arbitrary thresholds). */
|
||||
#define CACHE_RECONCILE_NEW_TINY 32768
|
||||
#define CACHE_RECONCILE_OLD_SOLID 65536
|
||||
|
||||
/* Replace the new-generation file by the old one, when the old one exists. */
|
||||
static void reconcile_promote(httrackp *opt, const char *oldname,
|
||||
const char *newname) {
|
||||
if (fexist(reconcile_path(opt, oldname))) {
|
||||
remove(reconcile_path(opt, newname));
|
||||
rename(reconcile_path(opt, oldname), reconcile_path(opt, newname));
|
||||
}
|
||||
}
|
||||
|
||||
/* Non-zero when `name` exists under the mirror directory. */
static int reconcile_has(httrackp *opt, const char *name) {
  return fexist(reconcile_path(opt, name)) ? 1 : 0;
}

/* Non-zero when both legacy old-generation files (old.dat, old.ndx) exist. */
static int reconcile_has_old_legacy(httrackp *opt) {
  return reconcile_has(opt, "hts-cache/old.dat") &&
         reconcile_has(opt, "hts-cache/old.ndx");
}

/* Promote the legacy old.dat/old.ndx pair over new.dat/new.ndx. */
static void reconcile_promote_legacy(httrackp *opt) {
  reconcile_promote(opt, "hts-cache/old.dat", "hts-cache/new.dat");
  reconcile_promote(opt, "hts-cache/old.ndx", "hts-cache/new.ndx");
}

/* Interrupted-run test for one file pair: both generations exist, the new
   one is below CACHE_RECONCILE_NEW_TINY, and the old one is above
   CACHE_RECONCILE_OLD_SOLID and larger than the new one. The existence
   checks come first: fsize() is -1 for a missing file, which would
   otherwise pass the "below TINY" test. */
static int reconcile_new_stalled(httrackp *opt, const char *newname,
                                 const char *oldname) {
  if (!reconcile_has(opt, newname) || !reconcile_has(opt, oldname))
    return 0;
  if (fsize(reconcile_path(opt, newname)) >= CACHE_RECONCILE_NEW_TINY)
    return 0;
  if (fsize(reconcile_path(opt, oldname)) <= CACHE_RECONCILE_OLD_SOLID)
    return 0;
  return fsize(reconcile_path(opt, oldname)) >
         fsize(reconcile_path(opt, newname));
}

/* Reconcile the on-disk hts-cache/ generations (new.* vs old.*) according to
   `mode`. Each policy only acts on files that exist, so it is a no-op when
   the involved files are absent. */
void hts_cache_reconcile(httrackp *opt, hts_cache_reconcile_mode mode) {
  switch (mode) {
  case CACHE_RECONCILE_PROMOTE: {
    /* Previous run rotated new.* to old.* then died before writing: promote
       the old generation back, whichever format it uses. */
    const int new_legacy_complete = reconcile_has(opt, "hts-cache/new.dat") &&
                                    reconcile_has(opt, "hts-cache/new.ndx");

    if (!reconcile_has(opt, "hts-cache/new.zip"))
      reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
    if (!new_legacy_complete && reconcile_has_old_legacy(opt))
      reconcile_promote_legacy(opt);
    break;
  }
  case CACHE_RECONCILE_INTERRUPTED:
    /* Aborted run: keep the larger generation when the new cache is
       suspiciously small next to the old one. Only considered when caching
       is enabled and the in-progress lock file is still present. */
    if (opt->cache && reconcile_has(opt, "hts-in_progress.lock")) {
      if (reconcile_new_stalled(opt, "hts-cache/new.zip", "hts-cache/old.zip"))
        reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
      /* The legacy pair additionally requires old.ndx to be present. */
      if (reconcile_has(opt, "hts-cache/old.ndx") &&
          reconcile_new_stalled(opt, "hts-cache/new.dat", "hts-cache/old.dat"))
        reconcile_promote_legacy(opt);
    }
    break;
  case CACHE_RECONCILE_ROLLBACK:
    /* Nothing transferred: restore the previous generation and sidecars. */
    reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
    if (reconcile_has_old_legacy(opt))
      reconcile_promote_legacy(opt);
    reconcile_promote(opt, "hts-cache/old.lst", "hts-cache/new.lst");
    reconcile_promote(opt, "hts-cache/old.txt", "hts-cache/new.txt");
    break;
  }
}
|
||||
|
||||
// renvoyer uniquement en tête, ou NULL si erreur
|
||||
// return NULL upon error, and set -1 to r.statuscode
|
||||
htsblk *cache_header(httrackp * opt, cache_back * cache, const char *adr,
|
||||
|
||||
@@ -78,6 +78,17 @@ htsblk *cache_header(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, htsblk * r);
|
||||
void cache_init(cache_back * cache, httrackp * opt);
|
||||
|
||||
/* Policy telling hts_cache_reconcile() which hts-cache/ generation
   (new.* vs old.*) is authoritative. */
typedef enum {
  /* no new cache: promote the old generation */
  CACHE_RECONCILE_PROMOTE,
  /* aborted run: keep the larger generation */
  CACHE_RECONCILE_INTERRUPTED,
  /* nothing transferred: restore the old one */
  CACHE_RECONCILE_ROLLBACK
} hts_cache_reconcile_mode;
|
||||
|
||||
/* Reconcile the on-disk cache generations according to mode; a no-op when
|
||||
the involved files are absent. */
|
||||
void hts_cache_reconcile(httrackp *opt, hts_cache_reconcile_mode mode);
|
||||
|
||||
int cache_writedata(FILE * cache_ndx, FILE * cache_dat, const char *str1,
|
||||
const char *str2, char *outbuff, int len);
|
||||
int cache_readdata(cache_back * cache, const char *str1, const char *str2,
|
||||
|
||||
@@ -48,6 +48,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@@ -321,6 +322,7 @@ typedef struct {
|
||||
size_t budget; /**< bytes allowed through before writes start failing */
|
||||
int fail_errno; /**< errno set on the failing write (ENOSPC, EIO, ...) */
|
||||
int writes; /**< zwrite call count, to detect re-entry into the stream */
|
||||
int fail_once; /**< recover (unlimited budget) after the first failure */
|
||||
} writefail_inject;
|
||||
|
||||
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||
@@ -335,6 +337,8 @@ static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||
inj->budget -= (size_t) size;
|
||||
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||
}
|
||||
if (inj->fail_once)
|
||||
inj->budget = (size_t) -1; /* the backend recovers after this failure */
|
||||
errno = inj->fail_errno;
|
||||
return 0; /* short write -> the minizip op returns an error */
|
||||
}
|
||||
@@ -373,9 +377,50 @@ static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||
freet(bodycopy);
|
||||
}
|
||||
|
||||
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||
/* Store an entry claiming a >2GB body; the degrade path never reads data. */
|
||||
static void writefail_store_oversized(httrackp *opt, cache_back *cache,
|
||||
const char *fil, int is_write) {
|
||||
htsblk r;
|
||||
char locbuf[4];
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = 200;
|
||||
r.size = (LLint) INT_MAX + 1;
|
||||
strcpybuff(r.msg, "OK");
|
||||
strcpybuff(r.contenttype, "application/octet-stream");
|
||||
locbuf[0] = '\0';
|
||||
r.location = locbuf;
|
||||
r.is_write = (short int) is_write;
|
||||
cache_add(opt, cache, &r, "example.com", fil, "example.com/big.bin", 1, NULL);
|
||||
}
|
||||
|
||||
/* Read back `entryname`: extra field (cached headers) and body. Returns the
|
||||
body length, or -1 if the entry is absent or unreadable. */
|
||||
static int writefail_read_entry(const char *path, const char *entryname,
|
||||
char *extra, size_t extralen, char *body,
|
||||
size_t bodylen) {
|
||||
unzFile z = unzOpen(path);
|
||||
int n = -1;
|
||||
|
||||
if (z == NULL)
|
||||
return -1;
|
||||
if (unzLocateFile(z, entryname, 1) == UNZ_OK &&
|
||||
unzOpenCurrentFile(z) == UNZ_OK) {
|
||||
const int elen = unzGetLocalExtrafield(z, extra, (unsigned) (extralen - 1));
|
||||
|
||||
if (elen >= 0) {
|
||||
extra[elen] = '\0';
|
||||
n = unzReadCurrentFile(z, body, (unsigned) bodylen);
|
||||
}
|
||||
unzCloseCurrentFile(z);
|
||||
}
|
||||
unzClose(z);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Cache write-failure policy (#174/#219): fatal errno or a failure streak
|
||||
stops the mirror (exit_xh=-1, no crash); isolated/oversized drops the entry.
|
||||
*/
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
int fail = 0;
|
||||
char path[HTS_URLMAXSIZE];
|
||||
@@ -388,9 +433,8 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
gen_body(body, body_len, 1 /* incompressible */);
|
||||
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||
|
||||
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||
branch). Both must abort the mirror. */
|
||||
/* phase 0: fatal errno (ENOSPC) aborts at once; phase 1: persistent EIO
|
||||
drops entries until the streak caps out, then aborts. */
|
||||
for (phase = 0; phase < 2; phase++) {
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
@@ -399,6 +443,7 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
inj.budget = (phase == 0) ? 4096 : 0;
|
||||
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
@@ -412,7 +457,25 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
}
|
||||
|
||||
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (phase == 0) {
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
} else {
|
||||
/* the abort must land exactly on the 8th consecutive failure */
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 7; i++) {
|
||||
char fil[32];
|
||||
|
||||
snprintf(fil, sizeof(fil), "/b%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
}
|
||||
if (cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase 1: aborted before the "
|
||||
"8th consecutive failure\n");
|
||||
fail++;
|
||||
}
|
||||
writefail_store(opt, &cache, "/b7.bin", body, 16);
|
||||
}
|
||||
if (!cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||
phase);
|
||||
@@ -443,6 +506,136 @@ int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
}
|
||||
}
|
||||
|
||||
/* failures with successes in between reset the streak: never aborts */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
int i;
|
||||
|
||||
inj.budget = (size_t) -1;
|
||||
inj.fail_errno = EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
for (i = 0; i < 10; i++) {
|
||||
char fil[32];
|
||||
|
||||
inj.budget = 0; /* this store fails */
|
||||
snprintf(fil, sizeof(fil), "/s%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
inj.budget = (size_t) -1; /* this one succeeds and resets the streak */
|
||||
snprintf(fil, sizeof(fil), "/ok%d.bin", i);
|
||||
writefail_store(opt, &cache, fil, body, 16);
|
||||
}
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: scattered: non-consecutive failures aborted "
|
||||
"the mirror (flagged=%d, exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
|
||||
/* isolated failure: only that entry drops; a later sibling round-trips */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
char extra[8192];
|
||||
char rbody[64];
|
||||
int n;
|
||||
|
||||
inj.budget = 4096;
|
||||
inj.fail_errno = EIO;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 1;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: skip: isolated failure aborted the mirror "
|
||||
"(flagged=%d, exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
n = writefail_read_entry(path, "http://example.com/blob2.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody));
|
||||
if (n != 16 || memcmp(rbody, body, 16) != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: skip: sibling entry lost after a skipped "
|
||||
"entry (%d)\n",
|
||||
n);
|
||||
fail++;
|
||||
}
|
||||
}
|
||||
|
||||
/* >2GB bodies: in-memory drops the entry, on-disk degrades to headers-only */
|
||||
{
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
char extra[8192];
|
||||
char rbody[64];
|
||||
int n;
|
||||
|
||||
inj.budget = (size_t) -1; /* no injected failure */
|
||||
inj.fail_errno = 0;
|
||||
inj.writes = 0;
|
||||
inj.fail_once = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
opt->state.exit_xh = 0;
|
||||
|
||||
writefail_store_oversized(opt, &cache, "/bigmem.bin", 0 /* in-memory */);
|
||||
writefail_store_oversized(opt, &cache, "/bigdisk.bin", 1 /* on-disk */);
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
|
||||
if (cache.zipWriteFailed || opt->state.exit_xh != 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: mirror aborted (flagged=%d, "
|
||||
"exit_xh=%d)\n",
|
||||
(int) cache.zipWriteFailed, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
if (writefail_read_entry(path, "http://example.com/bigmem.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody)) >= 0) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: in-memory entry was stored\n");
|
||||
fail++;
|
||||
}
|
||||
n = writefail_read_entry(path, "http://example.com/bigdisk.bin", extra,
|
||||
sizeof(extra), rbody, sizeof(rbody));
|
||||
if (n != 0 || strstr(extra, "X-In-Cache: 0") == NULL) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: oversize: on-disk entry not stored "
|
||||
"headers-only (%d)\n",
|
||||
n);
|
||||
fail++;
|
||||
}
|
||||
}
|
||||
|
||||
freet(body);
|
||||
return fail;
|
||||
}
|
||||
@@ -716,3 +909,494 @@ int cache_golden_selftest(httrackp *opt, const char *dir, int regen) {
|
||||
|
||||
return failures;
|
||||
}
|
||||
|
||||
/* --- hts_cache_reconcile() policies -------------------------------------- */
|
||||
|
||||
/* Every file hts_cache_reconcile() may read or write, relative to the mirror
   directory; reconcile_wipe() removes them all between test cases. */
static const char *const reconcile_files[] = {
    "hts-cache/new.zip",
    "hts-cache/old.zip",
    "hts-cache/new.dat",
    "hts-cache/old.dat",
    "hts-cache/new.ndx",
    "hts-cache/old.ndx",
    "hts-cache/new.lst",
    "hts-cache/old.lst",
    "hts-cache/new.txt",
    "hts-cache/old.txt",
    "hts-in_progress.lock",
};
|
||||
|
||||
/* Selftest counterpart of the cache's path helper: join opt->path_log and
   `name` into the next OPT_GET_BUFF() scratch slot and return it. */
static char *reconcile_st_path(httrackp *opt, const char *name) {
  const char *const base = StringBuff(opt->path_log);

  return fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), base, name);
}
|
||||
|
||||
/* Remove every file listed in reconcile_files; the remove() result is not
   checked, so files that are already absent are simply skipped. */
static void reconcile_wipe(httrackp *opt) {
  const size_t count = sizeof(reconcile_files) / sizeof(reconcile_files[0]);
  size_t idx = 0;

  while (idx < count) {
    remove(reconcile_st_path(opt, reconcile_files[idx]));
    idx++;
  }
}
|
||||
|
||||
/* Create a filler file of exactly `size` bytes. */
|
||||
static void reconcile_put(httrackp *opt, const char *name, size_t size) {
|
||||
FILE *const fp = fopen(reconcile_st_path(opt, name), "wb");
|
||||
static const char filler[1024] = {'x'};
|
||||
|
||||
assertf(fp != NULL);
|
||||
while (size > 0) {
|
||||
const size_t n = size > sizeof(filler) ? sizeof(filler) : size;
|
||||
|
||||
assertf(fwrite(filler, 1, n, fp) == n);
|
||||
size -= n;
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Expect `name` to weigh `size` bytes, or be absent when size == -1. */
|
||||
static int reconcile_expect(httrackp *opt, const char *name, off_t size,
|
||||
const char *what) {
|
||||
const off_t got = fsize(reconcile_st_path(opt, name));
|
||||
|
||||
if (got != size) {
|
||||
fprintf(stderr, "cache-reconcile: %s: %s is %d bytes, expected %d\n", what,
|
||||
name, (int) got, (int) size);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Drive hts_cache_reconcile() through its PROMOTE, INTERRUPTED and ROLLBACK
   policies on size-only file fixtures created under the log directory set up
   by golden_setup(opt, dir). Each scenario wipes the fixtures, lays down the
   files it needs, runs one policy and checks the resulting file sizes.
   Returns the number of failed checks. */
int cache_reconcile_selftest(httrackp *opt, const char *dir) {
  /* around the interrupted-run thresholds (new < 32768, old > 65536) */
  static const off_t TINY = 1024, MID = 40000, SOLID = 131072;
  /* expected size for a file that must be absent */
  static const off_t ABSENT = -1;
  static const char lock_file[] = "hts-in_progress.lock";
  static const char new_zip[] = "hts-cache/new.zip";
  static const char old_zip[] = "hts-cache/old.zip";
  static const char new_dat[] = "hts-cache/new.dat";
  static const char old_dat[] = "hts-cache/old.dat";
  static const char new_ndx[] = "hts-cache/new.ndx";
  static const char old_ndx[] = "hts-cache/old.ndx";
  static const char new_lst[] = "hts-cache/new.lst";
  static const char old_lst[] = "hts-cache/old.lst";
  static const char new_txt[] = "hts-cache/new.txt";
  static const char old_txt[] = "hts-cache/old.txt";
  int bad = 0;

  golden_setup(opt, dir);
#ifdef _WIN32
  mkdir(reconcile_st_path(opt, "hts-cache"));
#else
  mkdir(reconcile_st_path(opt, "hts-cache"), HTS_PROTECT_FOLDER);
#endif

  /* PROMOTE: a zip old generation replaces a missing new one */
  reconcile_wipe(opt);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
  bad += reconcile_expect(opt, new_zip, SOLID, "promote-zip");
  bad += reconcile_expect(opt, old_zip, ABSENT, "promote-zip");

  /* PROMOTE: an existing new.zip is left alone */
  reconcile_wipe(opt);
  reconcile_put(opt, new_zip, TINY);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
  bad += reconcile_expect(opt, new_zip, TINY, "promote-zip-noop");
  bad += reconcile_expect(opt, old_zip, SOLID, "promote-zip-noop");

  /* PROMOTE: a pure-legacy old generation is promoted too (was dead when no
     zip cache existed) */
  reconcile_wipe(opt);
  reconcile_put(opt, old_dat, SOLID);
  reconcile_put(opt, old_ndx, TINY);
  hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
  bad += reconcile_expect(opt, new_dat, SOLID, "promote-dat");
  bad += reconcile_expect(opt, new_ndx, TINY, "promote-dat");
  bad += reconcile_expect(opt, old_dat, ABSENT, "promote-dat");

  /* PROMOTE: a half-written legacy new pair is replaced by the old pair */
  reconcile_wipe(opt);
  reconcile_put(opt, new_dat, TINY);
  reconcile_put(opt, old_dat, SOLID);
  reconcile_put(opt, old_ndx, TINY);
  hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
  bad += reconcile_expect(opt, new_dat, SOLID, "promote-dat-partial");
  bad += reconcile_expect(opt, new_ndx, TINY, "promote-dat-partial");

  /* INTERRUPTED: no lock file, no action */
  reconcile_wipe(opt);
  reconcile_put(opt, new_zip, TINY);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_zip, TINY, "interrupted-nolock");

  /* INTERRUPTED: an absent new.zip must NOT promote old.zip (fsize(-1) would
     spuriously pass "< TINY"); leave the solid old generation for ROLLBACK */
  reconcile_wipe(opt);
  reconcile_put(opt, lock_file, 0);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_zip, ABSENT, "interrupted-nonew");
  bad += reconcile_expect(opt, old_zip, SOLID, "interrupted-nonew");

  /* INTERRUPTED: stalled tiny new.zip loses to a solid old.zip (was dead for
     zip caches: the arm was gated on a legacy new.dat) */
  reconcile_wipe(opt);
  reconcile_put(opt, lock_file, 0);
  reconcile_put(opt, new_zip, TINY);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_zip, SOLID, "interrupted-zip");
  bad += reconcile_expect(opt, old_zip, ABSENT, "interrupted-zip");

  /* INTERRUPTED: old below the confidence threshold, keep new */
  reconcile_wipe(opt);
  reconcile_put(opt, lock_file, 0);
  reconcile_put(opt, new_zip, TINY);
  reconcile_put(opt, old_zip, MID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_zip, TINY, "interrupted-smallold");

  /* INTERRUPTED: new big enough to trust, keep it */
  reconcile_wipe(opt);
  reconcile_put(opt, lock_file, 0);
  reconcile_put(opt, new_zip, MID);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_zip, MID, "interrupted-bignew");

  /* INTERRUPTED: the legacy pair follows the same size rule (was dead code) */
  reconcile_wipe(opt);
  reconcile_put(opt, lock_file, 0);
  reconcile_put(opt, new_dat, TINY);
  reconcile_put(opt, new_ndx, TINY);
  reconcile_put(opt, old_dat, SOLID);
  reconcile_put(opt, old_ndx, MID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
  bad += reconcile_expect(opt, new_dat, SOLID, "interrupted-dat");
  bad += reconcile_expect(opt, new_ndx, MID, "interrupted-dat");

  /* ROLLBACK: the old zip generation is restored (a zip cache used to lose
     its only good generation here) */
  reconcile_wipe(opt);
  reconcile_put(opt, new_zip, TINY);
  reconcile_put(opt, old_zip, SOLID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
  bad += reconcile_expect(opt, new_zip, SOLID, "rollback-zip");
  bad += reconcile_expect(opt, old_zip, ABSENT, "rollback-zip");

  /* ROLLBACK: sidecars are restored regardless of format */
  reconcile_wipe(opt);
  reconcile_put(opt, new_lst, TINY);
  reconcile_put(opt, old_lst, MID);
  reconcile_put(opt, old_txt, MID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
  bad += reconcile_expect(opt, new_lst, MID, "rollback-lst");
  bad += reconcile_expect(opt, new_txt, MID, "rollback-txt");

  /* ROLLBACK: full legacy generation incl. sidecars (historical behavior) */
  reconcile_wipe(opt);
  reconcile_put(opt, new_dat, TINY);
  reconcile_put(opt, new_ndx, TINY);
  reconcile_put(opt, old_dat, SOLID);
  reconcile_put(opt, old_ndx, MID);
  reconcile_put(opt, old_lst, MID);
  reconcile_put(opt, old_txt, MID);
  hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
  bad += reconcile_expect(opt, new_dat, SOLID, "rollback-dat");
  bad += reconcile_expect(opt, new_ndx, MID, "rollback-dat");
  bad += reconcile_expect(opt, new_lst, MID, "rollback-dat");
  bad += reconcile_expect(opt, new_txt, MID, "rollback-dat");

  /* ROLLBACK: nothing to restore, the new generation stays */
  reconcile_wipe(opt);
  reconcile_put(opt, new_zip, TINY);
  hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
  bad += reconcile_expect(opt, new_zip, TINY, "rollback-noop");

  /* leave no fixture behind */
  reconcile_wipe(opt);
  return bad;
}
|
||||
|
||||
/* --- read-side corruption injection --------------------------------------- */
|
||||
|
||||
/* canary read back intact after each corruption; victim gets the byte surgery
|
||||
*/
|
||||
#define CORRUPT_ADR "corrupt.example.com"
|
||||
static char corrupt_body_a[33 + 1];
|
||||
static char corrupt_body_b[44 + 1];
|
||||
|
||||
/* Write a fresh two-entry cache: /canary.html then /victim.html. */
|
||||
static void corrupt_build(httrackp *opt) {
|
||||
cache_back cache;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
memset(corrupt_body_b, 'b', sizeof(corrupt_body_b) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/victim.html", "victim.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_b,
|
||||
strlen(corrupt_body_b));
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
/* Like corrupt_build, but the victim carries a 20-char Etag whose header line
|
||||
is later overwritten with a forged oversized X-Size (same byte length). */
|
||||
static void corrupt_build_etag(httrackp *opt) {
|
||||
cache_back cache;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
memset(corrupt_body_b, 'b', sizeof(corrupt_body_b) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/victim.html", "victim.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "AAAAAAAAAAAAAAAAAAAA", "", "",
|
||||
corrupt_body_b, strlen(corrupt_body_b));
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
/* Like corrupt_build_etag, but the victim is headers-only (X-In-Cache: 0,
|
||||
body on disk): the shape every non-html file is stored with. */
|
||||
static void corrupt_build_disk(httrackp *opt) {
|
||||
cache_back cache;
|
||||
htsblk w;
|
||||
char locw[4];
|
||||
char BIGSTK save[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK catbuff[HTS_URLMAXSIZE * 2];
|
||||
char *path;
|
||||
FILE *fp;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
fconcat(save, sizeof(save), StringBuff(opt->path_html_utf8),
|
||||
CORRUPT_ADR "/victim.bin");
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
hts_init_htsblk(&w);
|
||||
w.statuscode = 200;
|
||||
w.size = (LLint) sizeof(corrupt_body_b) - 1;
|
||||
strcpybuff(w.msg, "OK");
|
||||
strcpybuff(w.contenttype, "application/octet-stream");
|
||||
strcpybuff(w.etag, "AAAAAAAAAAAAAAAAAAAA");
|
||||
locw[0] = '\0';
|
||||
w.location = locw;
|
||||
w.is_write = 0;
|
||||
cache_add(opt, &cache, &w, CORRUPT_ADR, "/victim.bin", save,
|
||||
0 /* all_in_cache */, NULL);
|
||||
selftest_close(&cache);
|
||||
/* the reader only checks this file exists; it never reads it here */
|
||||
path = fconv(catbuff, sizeof(catbuff), save);
|
||||
(void) structcheck(path);
|
||||
fp = FOPEN(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Patch the nth of total occurrences of pat (same-length rep) in new.zip. */
|
||||
static void corrupt_patch(httrackp *opt, const char *pat, size_t patlen,
|
||||
const char *rep, size_t nth, size_t total) {
|
||||
LLint fsz = 0;
|
||||
char *data = readfile2(reconcile_st_path(opt, "hts-cache/new.zip"), &fsz);
|
||||
const size_t n = (size_t) fsz;
|
||||
size_t k, hits = 0, at = 0;
|
||||
FILE *fp;
|
||||
|
||||
assertf(data != NULL);
|
||||
for (k = 0; k + patlen <= n; k++) {
|
||||
if (memcmp(data + k, pat, patlen) == 0) {
|
||||
hits++;
|
||||
if (hits == nth)
|
||||
at = k;
|
||||
}
|
||||
}
|
||||
assertf(hits == total);
|
||||
memcpy(data + at, rep, patlen);
|
||||
fp = fopen(reconcile_st_path(opt, "hts-cache/new.zip"), "wb");
|
||||
assertf(fp != NULL);
|
||||
assertf(fwrite(data, 1, n, fp) == n);
|
||||
fclose(fp);
|
||||
freet(data);
|
||||
}
|
||||
|
||||
/* Garbage the first bytes of the victim's deflated data (2nd local header). */
|
||||
static void corrupt_victim_body(httrackp *opt) {
|
||||
LLint fsz = 0;
|
||||
char *data = readfile2(reconcile_st_path(opt, "hts-cache/new.zip"), &fsz);
|
||||
const size_t n = (size_t) fsz;
|
||||
size_t k, hits = 0, off = 0;
|
||||
FILE *fp;
|
||||
|
||||
assertf(data != NULL);
|
||||
for (k = 0; k + 4 <= n; k++) {
|
||||
if (memcmp(data + k, "PK\x03\x04", 4) == 0 && ++hits == 2) {
|
||||
const size_t namelen =
|
||||
(unsigned char) data[k + 26] | ((unsigned char) data[k + 27] << 8);
|
||||
const size_t extralen =
|
||||
(unsigned char) data[k + 28] | ((unsigned char) data[k + 29] << 8);
|
||||
|
||||
off = k + 30 + namelen + extralen;
|
||||
}
|
||||
}
|
||||
assertf(hits == 2);
|
||||
assertf(off != 0 && off + 4 <= n);
|
||||
memset(data + off, 0xFF, 4);
|
||||
fp = fopen(reconcile_st_path(opt, "hts-cache/new.zip"), "wb");
|
||||
assertf(fp != NULL);
|
||||
assertf(fwrite(data, 1, n, fp) == n);
|
||||
fclose(fp);
|
||||
freet(data);
|
||||
}
|
||||
|
||||
/* Read the corrupt /victim.html and, in the SAME read session, the intact
|
||||
/canary.html: the victim must be rejected (wantmsg pins which path) and the
|
||||
canary must still decode byte-exact, proving one bad entry never taints a
|
||||
sibling read. */
|
||||
static int corrupt_expect_victim_fil(httrackp *opt, const char *fil,
|
||||
const char *wantmsg, const char *what) {
|
||||
cache_back cache;
|
||||
htsblk v, c;
|
||||
char BIGSTK lv[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK lc[HTS_URLMAXSIZE * 2];
|
||||
int fail = 0;
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
lv[0] = lc[0] = '\0';
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, fil, "", lv, NULL, 1);
|
||||
if (v.statuscode != STATUSCODE_INVALID) {
|
||||
fprintf(stderr, "%s: %s: victim: statuscode is %d, expected %d\n",
|
||||
selftest_tag, what, v.statuscode, STATUSCODE_INVALID);
|
||||
fail++;
|
||||
}
|
||||
if (wantmsg != NULL && strcmp(v.msg, wantmsg) != 0) {
|
||||
fprintf(stderr, "%s: %s: victim: msg is '%s', expected '%s'\n",
|
||||
selftest_tag, what, v.msg, wantmsg);
|
||||
fail++;
|
||||
}
|
||||
c = cache_readex(opt, &cache, CORRUPT_ADR, "/canary.html", "", lc, NULL, 1);
|
||||
if (c.statuscode != 200 || c.adr == NULL ||
|
||||
c.size != (LLint) strlen(corrupt_body_a) ||
|
||||
memcmp(c.adr, corrupt_body_a, strlen(corrupt_body_a)) != 0) {
|
||||
fprintf(stderr, "%s: %s: canary tainted (status %d)\n", selftest_tag, what,
|
||||
c.statuscode);
|
||||
fail++;
|
||||
}
|
||||
if (v.adr != NULL)
|
||||
freet(v.adr);
|
||||
if (c.adr != NULL)
|
||||
freet(c.adr);
|
||||
selftest_close(&cache);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* corrupt_expect_victim_fil() for the default victim, /victim.html. */
static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
                                 const char *what) {
  static const char victim_fil[] = "/victim.html";

  return corrupt_expect_victim_fil(opt, victim_fil, wantmsg, what);
}
|
||||
|
||||
/* Headers-only probe of the disk victim: must parse OK with the size kept. */
|
||||
static int corrupt_expect_disk_header(httrackp *opt, LLint wantsize,
|
||||
const char *what) {
|
||||
cache_back cache;
|
||||
htsblk v;
|
||||
char BIGSTK lv[HTS_URLMAXSIZE * 2];
|
||||
int fail = 0;
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
lv[0] = '\0';
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, "/victim.bin", NULL, lv, NULL, 1);
|
||||
if (v.statuscode != 200 || v.size != wantsize) {
|
||||
fprintf(stderr,
|
||||
"%s: %s: statuscode %d size " LLintP ", expected 200/" LLintP "\n",
|
||||
selftest_tag, what, v.statuscode, (LLint) v.size, wantsize);
|
||||
fail++;
|
||||
}
|
||||
if (v.adr != NULL)
|
||||
freet(v.adr);
|
||||
selftest_close(&cache);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* One zip corruption case: build, patch, then check victim+canary in-session.
|
||||
*/
|
||||
static int corrupt_case_zip(httrackp *opt, const char *pat, const char *rep,
|
||||
size_t nth, size_t total, const char *wantmsg,
|
||||
const char *what) {
|
||||
corrupt_build(opt);
|
||||
corrupt_patch(opt, pat, strlen(pat), rep, nth, total);
|
||||
return corrupt_expect_victim(opt, wantmsg, what);
|
||||
}
|
||||
|
||||
/* Read-side corruption suite: build small zip caches under the directory set
   up by golden_setup(opt, dir), damage them byte-wise, and check that each
   damaged entry is rejected with the expected message while its sibling
   entry stays readable. Returns the number of failed checks. */
int cache_corruption_selftest(httrackp *opt, const char *dir) {
  /* rejection messages expected from the reader */
  static const char msg_read_data[] = "Cache Read Error : Read Data";
  static const char msg_bad_size[] = "Cache Read Error : Bad Size";
  /* the victim's Etag header line, and its byte length: every forged X-Size
     line below has the same length so zip offsets are preserved */
  static const char etag_line[] = "Etag: AAAAAAAAAAAAAAAAAAAA";
  const size_t etag_line_len = 26;
  int bad = 0;

  selftest_tag = "cache-corrupt";
  golden_setup(opt, dir);

  bad += corrupt_case_zip(opt, "X-Size: 44", "X-Size: 99", 1, 1, msg_read_data,
                          "oversized X-Size");
  bad += corrupt_case_zip(opt, "X-Size: 44", "X-Size: -4", 1, 1, msg_bad_size,
                          "negative X-Size");
  /* both entries carry the line; the victim's is the second */
  bad += corrupt_case_zip(opt, "X-In-Cache: 1", "X-In-Cache: 0", 2, 2,
                          "Previous cache file not found (empty filename)",
                          "blanked X-In-Cache");
  /* smashed local file header: the entry is dropped at index load */
  bad += corrupt_case_zip(opt, "PK\x03\x04", "XK\x03\x04", 2, 2,
                          "File Cache Entry Not Found", "smashed local header");

  corrupt_build(opt);
  corrupt_victim_body(opt);
  bad += corrupt_expect_victim(opt, msg_read_data, "garbled deflate stream");

  /* An X-Size above INT_MAX is positive as int64 (slips a bare sign check) but
     truncates negative in the (int) cast the malloc uses: a wraparound alloc.
     cache_add asserts size fits an int, so such a value only reaches the reader
     from a corrupt/foreign cache; inject it by overwriting the victim's long
     Etag line with a same-length forged X-Size line (the parser keeps the last
     X-Size it sees), keeping the zip byte-length and offsets intact. */
  corrupt_build_etag(opt);
  corrupt_patch(opt, etag_line, etag_line_len, "X-Size: 2147483648AAAAAAAA", 1,
                1);
  bad += corrupt_expect_victim(opt, msg_bad_size, "X-Size above INT_MAX");

  /* A headers-only entry (X-In-Cache: 0) may carry an X-Size >= INT_MAX: that
     is how every >2GB non-html file is stored. It must survive a header probe
     (or every update re-fetches the file); an in-memory read still rejects. */
  corrupt_build_disk(opt);
  corrupt_patch(opt, etag_line, etag_line_len, "X-Size: 2147483648AAAAAAAA", 1,
                1);
  bad += corrupt_expect_disk_header(opt, (LLint) 2147483648LL,
                                    "headers-only X-Size above INT_MAX");
  bad += corrupt_expect_victim_fil(opt, "/victim.bin", msg_bad_size,
                                   "in-memory X-Size above INT_MAX");

  /* exactly INT_MAX pins the >= boundary: (int) r.size + 1 would overflow */
  corrupt_build_disk(opt);
  corrupt_patch(opt, etag_line, etag_line_len, "X-Size: 2147483647AAAAAAAA", 1,
                1);
  bad += corrupt_expect_victim_fil(opt, "/victim.bin", msg_bad_size,
                                   "in-memory X-Size at INT_MAX");

  /* the negative check must stay global, headers-only included */
  corrupt_build_disk(opt);
  corrupt_patch(opt, etag_line, etag_line_len, "X-Size: -2147483648AAAAAAA", 1,
                1);
  bad += corrupt_expect_victim_fil(opt, "/victim.bin", msg_bad_size,
                                   "headers-only negative X-Size");

  return bad;
}
|
||||
|
||||
@@ -52,10 +52,19 @@ int cache_selftests(httrackp *opt, const char *dir);
|
||||
committed file, never by the test). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||
crashing. Returns the failed-check count. */
|
||||
/* Cache write-failure policy (#174/#219): abort on fatal errno or a streak,
|
||||
drop just the entry otherwise. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
/* Exercise the hts_cache_reconcile() generation policies on file fixtures
|
||||
under <dir>. Returns the failed-check count. */
|
||||
int cache_reconcile_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
/* Inject read-side corruption (zip byte surgery: bad size, header, deflate)
|
||||
under <dir> and assert every case degrades to STATUSCODE_INVALID without
|
||||
tainting a sibling entry. */
|
||||
int cache_corruption_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2137,47 +2137,7 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"No data seems to have been transferred during this session! : restoring previous one!");
|
||||
XH_uninit;
|
||||
if ((fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log), "hts-cache/old.dat")))
|
||||
&&
|
||||
(fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx")))) {
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.lst"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.txt"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.lst"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.lst"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.txt"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.txt"));
|
||||
}
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
|
||||
opt->state.exit_xh = 2; /* interrupted (no connection detected) */
|
||||
return 1;
|
||||
}
|
||||
@@ -2892,6 +2852,9 @@ int check_fatal_io_errno(void) {
|
||||
#endif
|
||||
#ifdef EROFS
|
||||
case EROFS: /* Read-only file system */
|
||||
#endif
|
||||
#ifdef EDQUOT
|
||||
case EDQUOT: /* Disk quota exceeded */
|
||||
#endif
|
||||
return 1;
|
||||
break;
|
||||
@@ -3371,6 +3334,41 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
return n;
|
||||
}
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr) {
|
||||
engine_stats();
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
return RUN_CALLBACK7(
|
||||
opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Single implementation of the historical WAIT_FOR_AVAILABLE_SOCKET macros. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr) {
|
||||
const int prev = opt->state._hts_in_html_parsing;
|
||||
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) {
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, 0);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
if (!hts_loop_tick(sback, opt, -1, ptr))
|
||||
return HTS_FALSE;
|
||||
}
|
||||
opt->state._hts_in_html_parsing = prev;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
int back_pluggable_sockets(struct_back * sback, httrackp * opt) {
|
||||
int n;
|
||||
|
||||
|
||||
@@ -216,6 +216,7 @@ struct cache_back {
|
||||
int zipEntriesCapa;
|
||||
hts_boolean
|
||||
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||
int zipWriteFailures; /**< consecutive entry write failures; reset on store */
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||
@@ -432,6 +433,15 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr);
|
||||
|
||||
/* Wait until a test socket can be plugged, pumping transfers, stats and the
|
||||
loop callback; gives up past the -E deadline. HTS_FALSE = callback abort. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr);
|
||||
|
||||
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
|
||||
timestamp seed so it is stable within one gap and rerolls per launch. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
|
||||
|
||||
@@ -544,69 +544,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
}
|
||||
|
||||
// Existence d'un cache - pas de new mais un old.. renommer
|
||||
// No new cache but an old one? promote it
|
||||
#if DEBUG_STEPS
|
||||
printf("Checking cache\n");
|
||||
#endif
|
||||
if (!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/new.zip"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/old.zip"))) {
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
}
|
||||
} else
|
||||
if ((!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/new.dat")))
|
||||
||
|
||||
(!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx")))) {
|
||||
if ((fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/old.dat")))
|
||||
&&
|
||||
(fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx")))) {
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//remove(fconcat(StringBuff(opt->path_log),"hts-cache/new.lst"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//rename(fconcat(StringBuff(opt->path_log),"hts-cache/old.lst"),fconcat(StringBuff(opt->path_log),"hts-cache/new.lst"));
|
||||
}
|
||||
}
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
|
||||
|
||||
/* Interrupted mirror detected */
|
||||
if (!opt->quiet) {
|
||||
@@ -2554,109 +2496,8 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
printf("Cache & log settings\n");
|
||||
#endif
|
||||
|
||||
// on utilise le cache..
|
||||
// en cas de présence des deux versions, garder la version la plus avancée,
|
||||
// cad la version contenant le plus de fichiers
|
||||
if (opt->cache) {
|
||||
if (fexist(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log), "hts-in_progress.lock"))) { // problemes..
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"))) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip")) < 32768) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip")) > 65536) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip")) > fsize(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->
|
||||
path_log),
|
||||
"hts-cache/new.zip")))
|
||||
{
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"))
|
||||
&&
|
||||
fexist(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"))
|
||||
&&
|
||||
fexist(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"))) {
|
||||
// switcher si new<32Ko et old>65Ko (tailles arbitraires) ?
|
||||
// ce cas est peut être une erreur ou un crash d'un miroir ancien, prendre
|
||||
// alors l'ancien cache
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat")) < 32768) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat")) > 65536) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat")) > fsize(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->
|
||||
path_log),
|
||||
"hts-cache/new.dat")))
|
||||
{
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//} else { // ne rien faire
|
||||
// remove("hts-cache/old.dat");
|
||||
// remove("hts-cache/old.ndx");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// If both cache generations exist, keep the most complete one
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
|
||||
// Débuggage des en têtes
|
||||
if (_DEBUG_HEAD) {
|
||||
ioinfo =
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-10"
|
||||
#define HTTRACK_VERSIONID "3.49.10"
|
||||
#define HTTRACK_VERSION "3.49-11"
|
||||
#define HTTRACK_VERSIONID "3.49.11"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
|
||||
@@ -74,37 +74,6 @@ static const char *hts_tbdev[] = {
|
||||
""
|
||||
};
|
||||
|
||||
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() \
|
||||
do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback, opt, cache, 0); \
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */ \
|
||||
if (!back_checkmirror(opt)) \
|
||||
break; \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \
|
||||
/* Check */ \
|
||||
{ \
|
||||
if (!RUN_CALLBACK7( \
|
||||
opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while (0)
|
||||
|
||||
/* Strip all // */
|
||||
static void cleanDoubleSlash(char *s) {
|
||||
int i, j;
|
||||
@@ -658,11 +627,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
int has_been_moved = 0;
|
||||
lien_adrfil current;
|
||||
|
||||
/* Ensure we don't use too many sockets by using a "testing" one
|
||||
If we have only 1 simultaneous connection authorized, wait for pending download
|
||||
Wait for an available slot
|
||||
/* Wait for an available test slot, honoring the connection limits
|
||||
*/
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
|
||||
/* Rock'in */
|
||||
current.adr[0] = current.fil[0] = '\0';
|
||||
@@ -692,24 +660,11 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart),
|
||||
&HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
stop_looping = 1;
|
||||
}
|
||||
@@ -774,8 +729,9 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
"Loop with HEAD request (during prefetch) at %s%s",
|
||||
current.adr, current.fil);
|
||||
}
|
||||
// Ajouter
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
if (!hts_wait_available_socket(sback, opt,
|
||||
cache, ptr))
|
||||
return -1;
|
||||
if (back_add(sback, opt, cache, moved.adr, moved.fil, methode, referer_adr, referer_fil, 1) != -1) { // OK
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"(during prefetch) %s (%d) to link %s at %s%s",
|
||||
|
||||
101
src/htsparse.c
101
src/htsparse.c
@@ -3399,20 +3399,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, 0, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, 0, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -3423,7 +3410,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
nofollow = 1; // moins violent
|
||||
opt->state._hts_cancel = 0;
|
||||
}
|
||||
|
||||
}
|
||||
// refresh the backing system each 2 seconds
|
||||
if (engine_stats()) {
|
||||
@@ -3960,22 +3946,8 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
{
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
b = 0;
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
|| !back_checkmirror(opt)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr) || !back_checkmirror(opt)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4081,20 +4053,7 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4281,26 +4240,12 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
||||
freet(s);
|
||||
}
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if HTS_POLL
|
||||
@@ -4533,10 +4478,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
IS_DELAYED_EXT(afs->save) && continue_loop && loops < 7; loops++) {
|
||||
continue_loop = 0;
|
||||
|
||||
/*
|
||||
Wait for an available slot
|
||||
*/
|
||||
WAIT_FOR_AVAILABLE_SOCKET();
|
||||
/* Wait for an available slot */
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
|
||||
/* We can lookup directly in the cache to speedup this mess */
|
||||
if (opt->delayed_cached) {
|
||||
@@ -4682,29 +4626,14 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
{
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
back_set_unlocked(sback, b);
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
back_set_unlocked(sback, b);
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
} while (
|
||||
/* dns/connect/request */
|
||||
|
||||
@@ -175,33 +175,4 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
/* Apply changes */ \
|
||||
* str->ptr_ = ptr
|
||||
|
||||
#define WAIT_FOR_AVAILABLE_SOCKET() \
|
||||
do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback, opt, cache, 0); \
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */ \
|
||||
if (!back_checkmirror(opt)) \
|
||||
break; \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \
|
||||
/* Check */ \
|
||||
if (!RUN_CALLBACK7( \
|
||||
opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1347,6 +1347,30 @@ static int st_cache_writefail(httrackp *opt, int argc, char **argv) {
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_cache_corrupt(httrackp *opt, int argc, char **argv) {
|
||||
int err;
|
||||
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "cache-corrupt: needs a directory\n");
|
||||
return 1;
|
||||
}
|
||||
err = cache_corruption_selftest(opt, argv[0]);
|
||||
printf("cache-corrupt: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_reconcile(httrackp *opt, int argc, char **argv) {
|
||||
int err;
|
||||
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "reconcile: needs a directory\n");
|
||||
return 1;
|
||||
}
|
||||
err = cache_reconcile_selftest(opt, argv[0]);
|
||||
printf("cache-reconcile: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_dns(httrackp *opt, int argc, char **argv) {
|
||||
const int err = dns_selftests(opt);
|
||||
|
||||
@@ -2119,6 +2143,10 @@ static const struct selftest_entry {
|
||||
st_cache_golden},
|
||||
{"cache-writefail", "<dir>", "cache write-failure handling self-test",
|
||||
st_cache_writefail},
|
||||
{"reconcile", "<dir>", "cache generation reconcile policy self-test",
|
||||
st_reconcile},
|
||||
{"cache-corrupt", "<dir>", "cache read-side corruption self-test",
|
||||
st_cache_corrupt},
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
|
||||
17
tests/01_engine-reconcile.test
Normal file
17
tests/01_engine-reconcile.test
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Cache generation reconcile policies (httrack -#test=reconcile <dir>):
|
||||
# promote a stranded old generation, keep the larger one after an aborted
|
||||
# run, and restore the old one when an update transferred nothing.
|
||||
|
||||
set -eu
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
out=$(httrack -#test=reconcile "$dir")
|
||||
|
||||
test "$out" = "cache-reconcile: OK" || {
|
||||
echo "expected 'cache-reconcile: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
19
tests/01_zlib-cache-corrupt.test
Normal file
19
tests/01_zlib-cache-corrupt.test
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Read-side cache corruption (httrack -#test=cache-corrupt <dir>): zip byte
|
||||
# surgery (bad/oversized X-Size, blanked X-In-Cache, smashed header, garbled
|
||||
# deflate) must each be rejected per-entry, never crash, never taint the sibling.
|
||||
|
||||
set -eu
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
# the smashed-header case logs expected "Corrupted cache entry" warnings on
|
||||
# stdout; the verdict is the last line
|
||||
out=$(httrack -#test=cache-corrupt "$dir" 2>/dev/null | tail -n1)
|
||||
|
||||
test "$out" = "cache-corrupt: OK" || {
|
||||
echo "expected 'cache-corrupt: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -4,10 +4,9 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Cache write-failure handling (httrack -#test=cache-writefail <dir>). #174/#219.
|
||||
# A failing new.zip write (disk full) used to crash the process via assertf; it
|
||||
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
|
||||
# self-test asserts that; reverting the fix makes -#test=cache-writefail abort (SIGABRT) and fail.
|
||||
# Cache write-failure policy (-#test=cache-writefail <dir>). #174/#219: disk
|
||||
# full or a failure streak aborts cleanly; an isolated failure or an oversized
|
||||
# entry is only dropped.
|
||||
|
||||
set -eu
|
||||
|
||||
@@ -22,3 +21,9 @@ printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
|
||||
echo "expected 'cache-writefail: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# A skipped entry must be warned about with its URL.
|
||||
printf '%s\n' "$out" | grep -q "entry not cached: example.com/" || {
|
||||
echo "expected a URL-bearing skip warning" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
#!/bin/bash
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
|
||||
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns
|
||||
# "incomplete transfer" and skips the cache; -%B (tolerant) accepts it.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# Default: warn, but the file is still written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-found 'bogus state \(broken size' \
|
||||
--log-found 'incomplete transfer \(expected' \
|
||||
httrack 'BASEURL/size/index.html'
|
||||
|
||||
# -%B (tolerant): no warning, file written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-not-found 'bogus state' \
|
||||
--log-not-found 'incomplete transfer|not cached' \
|
||||
httrack 'BASEURL/size/index.html' '-%B'
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
|
||||
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
|
||||
# no "bogus state" cache warnings, resolvable links still land correctly.
|
||||
# no "not cached" warnings, resolvable links still land correctly.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -16,5 +16,5 @@ bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
|
||||
--not-found 'delayed/noloc.html' \
|
||||
--not-found 'delayed/selfloop.html' \
|
||||
--not-found 'delayed/chain9.pdf' \
|
||||
--log-not-found 'bogus state' \
|
||||
--log-not-found 'not cached' \
|
||||
httrack 'BASEURL/delayed/index.html'
|
||||
|
||||
55
tests/36_local-bigcrawl.test
Normal file
55
tests/36_local-bigcrawl.test
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Diverse seeded /big/ crawl: 12 pattern families, decoy absence, update pass
|
||||
# must 304-revalidate. 360 = 1 index + 96 pages + 192 imgs + 5 shared + 60
|
||||
# family + 6 singles; the 4 planted errors write -o1 pages, not counted.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --rerun \
|
||||
--errors 4 --files 360 \
|
||||
--found 'big/p/95.html' \
|
||||
--found 'big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png' \
|
||||
--found 'big/a/f2-2x.png' \
|
||||
--found 'big/a/subs.vtt' \
|
||||
--found 'big/a/font.woff2' \
|
||||
--found 'big/a/js-data.bin' \
|
||||
--found 'big/d/01.pdf' \
|
||||
--found 'big/d/named.pdf' \
|
||||
--found 'big/a/doc.pdf' \
|
||||
--found "big/f9/caf$(printf '\xc3\xa9').html" \
|
||||
--found 'big/f7/fa.html' \
|
||||
--found 'big/a/ref.png' \
|
||||
--found 'big/f6/sub/leaf.html' \
|
||||
--found 'big/f1/dir/index.html' \
|
||||
--found 'big/f10/empty.html' \
|
||||
--found 'big/indexd41d.html' \
|
||||
--found 'big/a/i0a.png' \
|
||||
--not-found 'big/x/og' \
|
||||
--not-found 'big/x/tw' \
|
||||
--not-found 'big/x/jsonld.png' \
|
||||
--not-found 'big/x/never-scanned.png' \
|
||||
--not-found 'big/x/atom-only.html' \
|
||||
--not-found 'big/x/sitemap-only.html' \
|
||||
--not-found 'big/x/form-target.html' \
|
||||
--not-found 'big/x/formact' \
|
||||
--not-found 'big/x/ping' \
|
||||
--not-found 'big/x/aj.jar' \
|
||||
--not-found 'big/x/bj.jar' \
|
||||
--not-found 'big/x/is1.png' \
|
||||
--not-found 'big/x/concat.html' \
|
||||
--file-matches 'big/p/2.html' 'srcset="\.\./a/f2-1x\.png 1x, \.\./a/f2-2x\.png 2x"' \
|
||||
--file-matches 'big/a/blk2.css' 'url\(blk2-bg\.png\)' \
|
||||
--file-matches 'big/p/5.html' "document\\.write\\('<a href=\"\\.\\./f5/dw\\.html\"" \
|
||||
--file-not-matches 'big/p/1.html' 'href="/big/' \
|
||||
--log-not-found 'not cached|[Pp]anic|assert' \
|
||||
--log-found '\(404\) at link [^ ]*/big/e/404\.html' \
|
||||
--log-found '\(410\) at link [^ ]*/big/e/410\.html' \
|
||||
--log-found '\(500\) at link [^ ]*/big/e/500\.html' \
|
||||
--log-found 'decompressing.*big/e/gztrunc\.html' \
|
||||
--log-found ', no files updated' \
|
||||
--max-mirror-bytes 700000 \
|
||||
--min-mirror-bytes 500000 \
|
||||
httrack 'BASEURL/big/index.html' --retries=0 -c8 -%c100 -A100000000
|
||||
12
tests/37_local-cache-outage.test
Normal file
12
tests/37_local-cache-outage.test
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An update run against a dead server must not destroy the cache: the no-data
|
||||
# rollback restores the previous hts-cache generation (zip caches lost it).
|
||||
|
||||
set -eu
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun-dead \
|
||||
--found 'simple/basic.html' \
|
||||
httrack 'BASEURL/simple/basic.html'
|
||||
14
tests/38_local-update-304.test
Normal file
14
tests/38_local-update-304.test
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An all-304 update of a tiny site (headers under the 32K rollback threshold)
|
||||
# is a healthy run: it must not trip the no-data rollback as a fake outage.
|
||||
|
||||
set -eu
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--log-found 'no files updated' \
|
||||
--log-not-found 'No data seems to have been transferred' \
|
||||
--found 'mini304/index.html' --found 'mini304/page.html' \
|
||||
httrack 'BASEURL/mini304/index.html'
|
||||
@@ -48,6 +48,7 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-reconcile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
@@ -63,6 +64,7 @@ TESTS = \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-corrupt.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
01_zlib-savename-cached.test \
|
||||
@@ -98,6 +100,9 @@ TESTS = \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test \
|
||||
34_local-maxtime.test \
|
||||
35_local-maxsize.test
|
||||
35_local-maxsize.test \
|
||||
36_local-bigcrawl.test \
|
||||
37_local-cache-outage.test \
|
||||
38_local-update-304.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -19,12 +19,14 @@
|
||||
# --max-mirror-bytes N \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --max-mirror-bytes asserts the mirrored content (host root) stays under N.
|
||||
# --max/--min-mirror-bytes bound the mirrored content bytes (host root).
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
# --rerun-dead re-runs with the server stopped: the no-data rollback must
|
||||
# restore the previous hts-cache generation byte-identical.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -37,6 +39,7 @@ key="${testdir}/server.key"
|
||||
tls=
|
||||
verbose=
|
||||
rerun=
|
||||
rerun_dead=
|
||||
tmpdir=
|
||||
serverpid=
|
||||
crawlpid=
|
||||
@@ -102,7 +105,8 @@ nargs=$#
|
||||
while test "$pos" -lt "$nargs"; do
|
||||
case "${args[$pos]}" in
|
||||
--debug) verbose=1 ;;
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--rerun-dead) rerun_dead=1 ;; # re-run with the server stopped (cache rollback)
|
||||
--no-purge)
|
||||
nopurge=1
|
||||
audit+=("--no-purge")
|
||||
@@ -126,7 +130,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory | --log-found | --log-not-found | --max-mirror-bytes)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found | --max-mirror-bytes | --min-mirror-bytes)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -241,6 +245,43 @@ if test -n "$rerun"; then
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- optional dead pass: server stopped, the cache must survive the rollback --
|
||||
if test -n "$rerun_dead"; then
|
||||
zip="${out}/hts-cache/new.zip"
|
||||
test -s "$zip" || die "no cache was written by the first pass"
|
||||
cp "$zip" "${tmpdir}/cache-before.zip"
|
||||
cp "${out}/hts-log.txt" "${tmpdir}/log-before.txt"
|
||||
kill "$serverpid" 2>/dev/null
|
||||
wait "$serverpid" 2>/dev/null
|
||||
serverpid=
|
||||
info "re-running httrack against the stopped server"
|
||||
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" \
|
||||
"${moreargs[@]}" "${hts[@]}" >"${log}.dead" 2>&1 &
|
||||
crawlpid=$!
|
||||
wait "$crawlpid" || true
|
||||
crawlpid=
|
||||
result "OK (dead pass ran)"
|
||||
# The dead pass must have gone through the no-data rollback, not bailed out
|
||||
# before the mirror loop (which would leave the cache trivially untouched).
|
||||
info "checking the dead pass hit the rollback"
|
||||
if grep -aq "No data seems to have been transferred" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "rollback notice not found in hts-log.txt"
|
||||
exit 1
|
||||
fi
|
||||
info "checking the previous cache generation was restored"
|
||||
if cmp -s "$zip" "${tmpdir}/cache-before.zip" &&
|
||||
test ! -e "${out}/hts-cache/old.zip"; then
|
||||
result "OK"
|
||||
else
|
||||
result "new.zip differs from the pre-outage cache (or old.zip left behind)"
|
||||
exit 1
|
||||
fi
|
||||
# Audits below describe the healthy crawl, not the dead pass.
|
||||
cp "${tmpdir}/log-before.txt" "${out}/hts-log.txt"
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
hostroot=
|
||||
for cand in "${out}/127.0.0.1_${port}" "${out}/127.0.0.1"; do
|
||||
@@ -327,6 +368,15 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--min-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} >= ${audit[$i]} bytes"
|
||||
if test "$sz" -ge "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too small"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
|
||||
@@ -15,6 +15,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
@@ -42,6 +43,416 @@ PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"""
|
||||
|
||||
|
||||
# --- /big/ seeded pseudo-site (36_local-bigcrawl) ---------------------------
|
||||
# Deterministic ~360-file tree; bodies derive from sha256(BIG_SEED, name) so
|
||||
# every run serves identical content and the test pins exact counts.
|
||||
BIG_SEED = "bigcrawl-lite-1"
|
||||
BIG_PAGES = 96
|
||||
BIG_FANOUT = 4
|
||||
# Fixed validator: a matching If-Modified-Since gets 304, so the update pass
|
||||
# revalidates instead of re-downloading.
|
||||
BIG_LASTMOD = "Mon, 01 Jan 2024 00:00:00 GMT"
|
||||
|
||||
BIG_CTYPES = {
|
||||
"html": "text/html",
|
||||
"css": "text/css",
|
||||
"js": "application/x-javascript",
|
||||
"png": "image/png",
|
||||
"gif": "image/gif",
|
||||
"jpg": "image/jpeg",
|
||||
"webp": "image/webp",
|
||||
"pdf": "application/pdf",
|
||||
"woff2": "font/woff2",
|
||||
"mp4": "video/mp4",
|
||||
"webm": "video/webm",
|
||||
"mp3": "audio/mpeg",
|
||||
"vtt": "text/vtt",
|
||||
"xml": "text/xml",
|
||||
"svg": "image/svg+xml",
|
||||
"jar": "application/java-archive",
|
||||
"bin": "application/octet-stream",
|
||||
}
|
||||
|
||||
# Honest magic bytes per claimed type so the #478 sniff never contests.
|
||||
BIG_MAGIC = {
|
||||
"png": b"\x89PNG\r\n\x1a\n",
|
||||
"gif": b"GIF89a",
|
||||
"jpg": b"\xff\xd8\xff\xe0",
|
||||
"webp": b"RIFF\x10\x27\x00\x00WEBPVP8 ",
|
||||
"pdf": b"%PDF-1.4\n",
|
||||
"woff2": b"wOF2",
|
||||
"mp4": b"\x00\x00\x00\x18ftypmp42",
|
||||
"webm": b"\x1a\x45\xdf\xa3",
|
||||
"mp3": b"ID3\x04\x00\x00\x00\x00\x00\x00",
|
||||
"jar": b"PK\x03\x04",
|
||||
}
|
||||
|
||||
|
||||
def big_blob(name, size):
|
||||
out = b""
|
||||
n = 0
|
||||
while len(out) < size:
|
||||
out += hashlib.sha256(f"{BIG_SEED}/{name}/{n}".encode()).digest()
|
||||
n += 1
|
||||
return out[:size]
|
||||
|
||||
|
||||
def big_asset(name):
|
||||
ext = name.rsplit(".", 1)[-1]
|
||||
size = 200 + int(hashlib.sha256(name.encode()).hexdigest(), 16) % 3800
|
||||
raw = big_blob(name, size)
|
||||
if ext in ("css", "js", "txt"):
|
||||
return b"/* " + raw.hex().encode() + b" */"
|
||||
return BIG_MAGIC.get(ext, b"") + raw
|
||||
|
||||
|
||||
def big_html(title, inner):
|
||||
page = (
|
||||
"<!DOCTYPE html><html><head><title>%s</title></head><body>\n%s\n</body></html>"
|
||||
% (
|
||||
title,
|
||||
inner,
|
||||
)
|
||||
)
|
||||
return page.encode()
|
||||
|
||||
|
||||
def _hexfill(name):
|
||||
return big_blob(name, 160).hex()
|
||||
|
||||
|
||||
HOME = '<a href="/big/index.html">home</a>'
|
||||
|
||||
BIG_TEXT_ASSETS = {
|
||||
"site.css": (
|
||||
"body { background: url(bg.png); } /* %s */" % _hexfill("site.css"),
|
||||
"text/css",
|
||||
),
|
||||
"print.css": ("p { margin: 0; } /* %s */" % _hexfill("print.css"), "text/css"),
|
||||
"blk.css": (
|
||||
'@import "blk2.css";\n'
|
||||
'@font-face { font-family: big; src: local("Nope Sans"), '
|
||||
'url(font.woff2) format("woff2"); }\n'
|
||||
"/* %s */" % _hexfill("blk.css"),
|
||||
"text/css",
|
||||
),
|
||||
# Absolute url() must come back relative after the rewrite (test greps it);
|
||||
# the \/ escapes collapse to an already-linked URL if taken literally.
|
||||
"blk2.css": (
|
||||
"body { background: url(/big/a/blk2-bg.png); }\n"
|
||||
"i { background: url(/big\\/a\\/bg.png); }\n"
|
||||
"/* %s */" % _hexfill("blk2.css"),
|
||||
"text/css",
|
||||
),
|
||||
# .open() grabs its first arg only (a method there is rejected, #218), so
|
||||
# the window.open single-URL form is the token-detected shape.
|
||||
"app.js": (
|
||||
'var im = new Image(); im.src = "/big/a/js-img.png";\n'
|
||||
'function pop() { window.open("/big/a/js-data.bin"); }\n'
|
||||
"// %s\n" % _hexfill("app.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
"heavy.js": (
|
||||
'var h = new Image(); h.src = "/big/a/js1.png";\n'
|
||||
'function nav() { location.href = "/big/p/1.html"; }\n'
|
||||
'function pop() { window.open("/big/a/js2.bin"); }\n'
|
||||
"// %s\n" % _hexfill("heavy.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
# text/javascript is fetched but never scanned: the URL inside must stay
|
||||
# out of the mirror.
|
||||
"decoy.js": (
|
||||
'var d = new Image(); d.src = "/big/x/never-scanned.png";\n',
|
||||
"text/javascript",
|
||||
),
|
||||
"subs.vtt": ("WEBVTT\n\n00:00.000 --> 00:01.000\nbig\n", "text/vtt"),
|
||||
"logo.svg": (
|
||||
'<svg xmlns="http://www.w3.org/2000/svg" width="4" height="4">'
|
||||
'<image href="ref.png" width="4" height="4"/></svg>',
|
||||
"image/svg+xml",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _fam_feeds(port):
|
||||
return (
|
||||
'<link rel="alternate" type="application/rss+xml" href="/big/f12/rss.xml">'
|
||||
'<a href="/big/f12/atom.xml">atom</a>'
|
||||
'<a href="/big/f12/sitemap.xml">sitemap</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_plain(port):
|
||||
return (
|
||||
'<a href="../f1/one.html">one</a>'
|
||||
'<a href="./two.html">two</a>'
|
||||
'<a href="../../big/f1/tri.html">tri</a>'
|
||||
'<a href="/big/f1/abs.html">abs</a>'
|
||||
'<a href="/big/f1/list.html">list</a>'
|
||||
'<a href="/big/f1/list.html?page=2">p2</a>'
|
||||
'<a href="/big/f1/list.html?page=3&sort=asc">p3</a>'
|
||||
'<a href="/big/f1/dir">dir</a>'
|
||||
'<a href="">self</a><a href="#">frag</a>'
|
||||
'<a href="mailto:big@example.com">mail</a>'
|
||||
'<a href="tel:+15551234">tel</a>'
|
||||
'<a href="data:text/plain;base64,aGk=">data</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_srcset(port):
|
||||
return (
|
||||
'<img src="/big/a/f2-base.png">'
|
||||
'<img srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png">'
|
||||
'<img data-srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png" loading="lazy">'
|
||||
'<picture><source type="image/webp" srcset="/big/a/f2-alt.webp">'
|
||||
'<img src="/big/a/f2-base.png"></picture>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_media(port):
|
||||
return (
|
||||
'<video src="/big/a/clip.mp4" poster="/big/a/poster.jpg">'
|
||||
'<source src="/big/a/clip.webm" type="video/webm">'
|
||||
'<track src="/big/a/subs.vtt" kind="subtitles" srclang="en">'
|
||||
"</video>"
|
||||
'<audio><source src="/big/a/tune.mp3" type="audio/mpeg"></audio>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_css(port):
|
||||
# image-set with descriptors is a proven-safe decoy (engine-surface §6).
|
||||
return (
|
||||
'<link rel="stylesheet" href="/big/a/print.css" media="print">'
|
||||
'<div style="background:url(/big/a/attr-bg.png)">styled</div>'
|
||||
'<style>@import "/big/a/blk.css"; h1 { background: url(/big/a/blk-bg.gif); }'
|
||||
' h2 { background-image: image-set("/big/x/is1.png" 1x, "/big/x/is2.png" 2x); }'
|
||||
"</style>"
|
||||
)
|
||||
|
||||
|
||||
def _fam_js(port):
|
||||
# The concatenated string is rejected by the scanner (no single literal).
|
||||
return (
|
||||
'<script src="/big/a/heavy.js"></script>'
|
||||
'<script src="/big/a/decoy.js"></script>'
|
||||
"<script>document.write('<a href=\"/big/f5/dw.html\">dw</a>');\n"
|
||||
'var nope = "xx-" + "/big/x/concat.html";</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_meta(port):
|
||||
# Extensionless decoy targets stay unfetchable even if the aggressive
|
||||
# parser fires (no known extension, no scheme: rejected in every state).
|
||||
return (
|
||||
'<meta http-equiv="refresh" content="2;URL=/big/f6/refreshed.html">'
|
||||
'<a href="/big/f6/based.html">based</a>'
|
||||
'<meta property="og:image" content="/big/x/og">'
|
||||
'<meta name="twitter:image" content="/big/x/tw">'
|
||||
'<script type="application/ld+json">'
|
||||
'{"@type": "Thing", "image": "/big/x/jsonld.png"}</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_legacy(port):
|
||||
# Comma-valued applet archive is rejected whole by the engine (decoy).
|
||||
return (
|
||||
'<a href="/big/f7/frames.html">frames</a>'
|
||||
'<img src="/big/a/map.gif" usemap="#m">'
|
||||
'<map name="m">'
|
||||
'<area shape="rect" coords="0,0,9,9" href="/big/f7/area.html"></map>'
|
||||
'<embed src="/big/a/e.pdf" type="application/pdf" width="9" height="9">'
|
||||
'<object data="/big/a/o.pdf" type="application/pdf"></object>'
|
||||
'<applet archive="/big/x/aj.jar,/big/x/bj.jar" width="1" height="1"></applet>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_svg(port):
|
||||
return (
|
||||
'<svg width="9" height="9">'
|
||||
'<image href="/big/a/svg-in.png" width="4" height="4"/>'
|
||||
'<use xlink:href="#icon"/></svg>'
|
||||
'<img src="/big/a/logo.svg">'
|
||||
)
|
||||
|
||||
|
||||
def _fam_i18n(port):
|
||||
return (
|
||||
'<a href="/big/f9/caf%C3%A9.html">cafe</a>'
|
||||
'<a href="/big/f9/latin1.html">latin1</a>'
|
||||
'<a href="/big/f9/metaonly.html">meta</a>'
|
||||
'<a href="/big/f9/bom.html">bom</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_http(port):
|
||||
return (
|
||||
'<a href="/big/r/hop1">chain</a>'
|
||||
'<a href="/big/r/get42">get42</a>'
|
||||
'<a href="/big/d/01">d01</a>'
|
||||
'<a href="/big/d/02">d02</a>'
|
||||
'<a href="/big/f10/empty.html">empty</a>'
|
||||
'<a href="/big/d/dl">dl</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_forms(port):
|
||||
# GET form action is rewritten but never fetched; formaction/ping are
|
||||
# outside the attribute tables (decoys).
|
||||
return (
|
||||
'<form action="/big/x/form-target.html" method="get">'
|
||||
'<input type="text" name="q">'
|
||||
'<input type="image" src="/big/a/btn.png" alt="go"></form>'
|
||||
'<a href="/big/f11/page.html">bare</a>'
|
||||
'<a href="/big/f11/page.html?utm_source=news&utm_medium=mail">utm</a>'
|
||||
'<a href="/big/f11/sess.html?PHPSESSID=deadbeef123">sess</a>'
|
||||
'<button formaction="/big/x/formact">go</button>'
|
||||
'<a href="/big/f11/page.html" ping="/big/x/ping">ping</a>'
|
||||
)
|
||||
|
||||
|
||||
# Family builders, one per link-construct family. big_page() picks an entry by
# page number, so the position of each builder decides which pages carry it.
BIG_FAMILIES = [
    _fam_feeds, _fam_plain, _fam_srcset, _fam_media,
    _fam_css, _fam_js, _fam_meta, _fam_legacy,
    _fam_svg, _fam_i18n, _fam_http, _fam_forms,
]
|
||||
|
||||
|
||||
def big_link(m, style):
    """Return the href for page `m` in one of three spellings.

    `style` indexes the spellings: 0 is sibling-relative ("m.html"), 1 is
    parent-relative ("../p/m.html"), 2 is site-absolute ("/big/p/m.html").
    """
    leaf = "%d.html" % m
    spellings = (leaf, "../p/" + leaf, "/big/p/" + leaf)
    return spellings[style]
|
||||
|
||||
|
||||
def big_page(n, port):
    """Build page `n` of the /big/p/ tree and return it via big_html().

    The page holds a home link, an "up" link to its parent (except for page
    0), links to its children below BIG_PAGES, the shared stylesheet and
    script, two images, and the markup of one link family.

    n    -- page number; also seeds every per-page variation below.
    port -- server port, forwarded to the family builder.
    """
    # Link spelling rotates with the page number (see big_link's styles).
    style = n % 3
    home = ["../index.html", "/big/index.html", "../index.html"][style]
    parts = ['<a href="%s">home</a>' % home]
    if n > 0:
        parts.append('<a href="%s">up</a>' % big_link((n - 1) // BIG_FANOUT, style))
    # Children of n are n*BIG_FANOUT+1 .. n*BIG_FANOUT+BIG_FANOUT, clipped to
    # the pages that exist.
    first_child = n * BIG_FANOUT + 1
    for c in range(first_child, first_child + BIG_FANOUT):
        if c < BIG_PAGES:
            parts.append('<a href="%s">p%d</a>' % (big_link(c, style), c))
    parts.append('<link rel="stylesheet" href="/big/a/site.css">')
    parts.append('<script src="/big/a/app.js"></script>')
    exts = ["png", "gif", "jpg"]
    ia = "/big/a/i%da.%s" % (n, exts[n % 3])
    ib = "/big/a/i%db.%s" % (n, exts[(n + 1) % 3])
    # Rotate the second-image construct across deterministic table attributes.
    con = n % 4
    if con == 0:
        parts.append('<img src="%s"><img src="%s">' % (ia, ib))
    elif con == 1:
        parts.append(
            '<img src="%s"><table background="%s"><tr><td>t</td></tr></table>'
            % (ia, ib)
        )
    elif con == 2:
        parts.append('<img src="%s"><img src="%s" data-src="%s">' % (ia, ia, ib))
    else:
        parts.append(
            '<img src="%s" loading="lazy"><video poster="%s"></video>' % (ia, ib)
        )
    # Index by the table's real length rather than a literal 12, so adding or
    # removing a family cannot skip entries or raise IndexError.
    parts.append(BIG_FAMILIES[n % len(BIG_FAMILIES)](port))
    return big_html("p%d" % n, "\n".join(parts))
|
||||
|
||||
|
||||
def big_index(port):
    """Build the /big/ index page and return it via big_html().

    The body links the shared assets, the root of the page tree, a deeply
    nested image, a 900-character query string, protocol-relative and
    absolute-host URLs on `port`, and the error and truncated-gzip paths.
    """
    template = "".join(
        [
            '<link rel="stylesheet" href="/big/a/site.css">',
            '<script src="/big/a/app.js"></script>',
            '<a href="p/0.html">root</a>',
            '<img src="/big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png">',
            '<a href="/big/f1/long.html?x=%s">long</a>',
            '<a href="/big/f1/gzok.html">gzok</a>',
            '<a href="//127.0.0.1:%d/big/f1/protorel.html">protorel</a>',
            '<a href="http://127.0.0.1:%d/big/f1/abshost.html">abshost</a>',
            '<a href="/big/e/404.html">e404</a>',
            '<a href="/big/e/410.html">e410</a>',
            '<a href="/big/e/500.html">e500</a>',
            '<a href="/big/e/gztrunc.html">gzt</a>',
            '<a href="?">query</a>',
        ]
    )
    long_query = "a" * 900
    body = template % (long_query, port, port)
    return big_html("big index", body)
|
||||
|
||||
|
||||
# Redirecting paths: request path -> (status code, Location header value).
BIG_REDIRECTS = {
    # Two-hop chain ending on a real page.
    "/big/r/hop1": (301, "/big/r/hop2"),
    "/big/r/hop2": (302, "/big/f10/land.html"),
    # Extensionless path redirecting to a typed asset.
    "/big/r/get42": (301, "/big/a/doc.pdf"),
    # Directory without trailing slash -> with trailing slash.
    "/big/f1/dir": (301, "/big/f1/dir/"),
}
|
||||
|
||||
# Leaf pages with no special behaviour: request path -> paragraph text that
# route_big() wraps into a page.
BIG_SIMPLE_PAGES = {
    "/big/p/two.html": "dot-slash target",
    # f1: plain-link targets
    "/big/f1/one.html": "one",
    "/big/f1/tri.html": "tri",
    "/big/f1/abs.html": "abs",
    "/big/f1/dir/": "dir index",
    "/big/f1/long.html": "long",
    "/big/f1/gzok.html": "gzok",
    "/big/f1/protorel.html": "protorel",
    "/big/f1/abshost.html": "abshost",
    # f5/f6: script- and meta-discovered targets
    "/big/f5/dw.html": "dw target",
    "/big/f6/refreshed.html": "refreshed",
    "/big/f6/sub/leaf.html": "leaf",
    # f7: frame and image-map targets
    "/big/f7/fa.html": "frame a",
    "/big/f7/fb.html": "frame b",
    "/big/f7/fn.html": "noframes",
    "/big/f7/area.html": "area",
    # f10/f11: redirect landing page and form/tracking targets
    "/big/f10/land.html": "landed",
    "/big/f11/page.html": "the page",
    "/big/f11/sess.html": "the sess page",
}
|
||||
|
||||
# Extensionless downloads: name resolution is wire-type driven (#478 contract).
# Request path -> (asset extension, Content-Disposition value or None).
BIG_DOWNLOADS = {
    "/big/d/01": ("pdf", None),
    "/big/d/02": ("png", None),
    "/big/d/dl": ("pdf", 'attachment; filename="named.pdf"'),
}
|
||||
|
||||
|
||||
def _big_rss(port):
|
||||
# purl.org marker makes the feed parse; item URLs are already-linked pages.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">\n'
|
||||
"<channel><title>big</title><link>http://127.0.0.1:%d/big/index.html</link>\n"
|
||||
"<item><title>i1</title><link>http://127.0.0.1:%d/big/p/1.html</link>\n"
|
||||
'<enclosure url="http://127.0.0.1:%d/big/p/2.html" type="text/html"/></item>\n'
|
||||
"</channel></rss>\n" % (port, port, port)
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_atom(port):
|
||||
# No purl marker: emitted verbatim, its URL must never be fetched.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<feed xmlns="http://www.w3.org/2005/Atom"><title>big</title>\n'
|
||||
"<entry><title>e1</title>"
|
||||
'<link href="http://127.0.0.1:%d/big/x/atom-only.html"/>'
|
||||
"</entry></feed>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_sitemap(port):
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
|
||||
"<url><loc>http://127.0.0.1:%d/big/x/sitemap-only.html</loc></url>\n"
|
||||
"</urlset>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
class Handler(SimpleHTTPRequestHandler):
|
||||
# Quieter logging; the launcher captures httrack's own log anyway.
|
||||
def log_message(self, fmt, *args):
|
||||
@@ -370,7 +781,7 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
self.send_raw(b"", "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
# warns "incomplete transfer" and skips the cache unless -%B.
|
||||
def route_size_index(self):
|
||||
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||
|
||||
@@ -420,6 +831,16 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_redir_target(self):
    """Send the fixed HTML body of the redirect-target page."""
    page = b"<html><body>redirect target</body></html>\n"
    self.send_raw(page, "text/html")
|
||||
|
||||
# --- /mini304/: tiny fully-cacheable site (an update gets only 304s) ---
|
||||
def route_mini304_index(self):
    """Send the /mini304/ index, a page with a single link to page.html.

    It goes through big_send(), so a matching If-Modified-Since gets a 304.
    """
    page = b'<html><body>\n\t<a href="page.html">page</a>\n</body></html>\n'
    self.big_send(page, "text/html")
|
||||
|
||||
def route_mini304_page(self):
    """Send the /mini304/ leaf page through big_send()."""
    page = b"<html><body>tiny cacheable page</body></html>\n"
    self.big_send(page, "text/html")
|
||||
|
||||
# --- delayed-type degenerate paths (issues #5/#107) --------------------
|
||||
def route_delayed_index(self):
|
||||
self.send_html(
|
||||
@@ -582,8 +1003,150 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
"/mini304/index.html": route_mini304_index,
|
||||
"/mini304/page.html": route_mini304_page,
|
||||
}
|
||||
|
||||
# --- /big/ seeded pseudo-site ------------------------------------------
|
||||
|
||||
def big_send(self, body, ctype, code=200, extra=()):
    """Send `body` with status `code`, answering conditional GETs with 304.

    body  -- response payload; its len() becomes Content-Length.
    ctype -- Content-Type header value.
    code  -- HTTP status; only 200 responses are conditional and carry
             Last-Modified.
    extra -- iterable of (header name, value) pairs sent after the
             standard headers.
    """
    cacheable = code == 200
    # A 200 whose If-Modified-Since matches BIG_LASTMOD exactly is replaced
    # by an empty 304.
    if cacheable and self.headers.get("If-Modified-Since") == BIG_LASTMOD:
        self.send_response(304)
        self.send_header("Content-Length", "0")
        self.end_headers()
        return

    self.send_response(code)
    if cacheable:
        self.send_header("Last-Modified", BIG_LASTMOD)
    self.send_header("Content-Type", ctype)
    self.send_header("Content-Length", str(len(body)))
    for header, setting in extra:
        self.send_header(header, setting)
    self.end_headers()

    # HEAD gets the same headers but no payload.
    if self.command == "HEAD":
        return
    self.wfile.write(body)
|
||||
|
||||
def big_error(self, code, reason):
    """Send an error page with status `code`.

    The page shows the numeric code followed by HOME; `reason` travels in
    an X-Reason response header.
    """
    markup = "<p>%d</p>%s" % (code, HOME)
    page = big_html("error", markup)
    self.big_send(page, "text/html", code=code, extra=[("X-Reason", reason)])
|
||||
|
||||
def route_big(self):
    """Answer a request under /big/ based on its percent-decoded path.

    The checks run in a fixed order and the first match wins: redirects,
    the index, simple pages, special-case pages, feeds, numbered tree
    pages, assets/decoys, extensionless downloads, error pages, the
    truncated-gzip page. Anything else gets a 404.
    """
    parts = urlsplit(self.path)
    path = unquote(parts.path)
    port = self.server.server_address[1]

    # Redirects are written directly: bare Location, empty body.
    if path in BIG_REDIRECTS:
        status, target = BIG_REDIRECTS[path]
        self.send_response(status)
        self.send_header("Location", target)
        self.send_header("Content-Length", "0")
        self.end_headers()
        return

    if path == "/big/index.html":
        self.big_send(big_index(port), "text/html")
        return

    if path in BIG_SIMPLE_PAGES:
        page = big_html(path, "<p>%s</p>%s" % (BIG_SIMPLE_PAGES[path], HOME))
        if path != "/big/f1/gzok.html":
            self.big_send(page, "text/html")
        else:
            # This one page is served gzip-encoded.
            self.big_send(
                gzip.compress(page, mtime=0),
                "text/html",
                extra=[("Content-Encoding", "gzip")],
            )
        return

    if path == "/big/f1/list.html":
        # Pagination: distinct content per query string.
        page = big_html("list", "<p>listing %s</p>%s" % (parts.query or "1", HOME))
        self.big_send(page, "text/html")
        return

    if path == "/big/f6/based.html":
        markup = (
            '<base href="http://127.0.0.1:%d/big/f6/sub/">'
            '<a href="leaf.html">leaf</a>' % port
        )
        self.big_send(big_html("based", markup), "text/html")
        return

    if path == "/big/f7/frames.html":
        frameset = (
            b'<html><frameset cols="50%,50%"><frame src="fa.html">'
            b'<frame src="fb.html"><noframes><body><a href="fn.html">fn</a>'
            b"</body></noframes></frameset></html>"
        )
        self.big_send(frameset, "text/html")
        return

    if path == "/big/f9/café.html":
        self.big_send(big_html("cafe", "<p>cafe</p>%s" % HOME), "text/html")
        return

    if path == "/big/f9/latin1.html":
        # Raw 0xE9 byte, with the charset declared in the header only.
        self.big_send(
            b"<html><body><p>caf\xe9 latin</p></body></html>",
            "text/html; charset=ISO-8859-1",
        )
        return

    if path == "/big/f9/metaonly.html":
        # Charset declared only by the <meta> tag; the header names none.
        meta_page = (
            '<html><head><meta charset="utf-8"></head>'
            "<body><p>café meta</p></body></html>"
        )
        self.big_send(meta_page.encode(), "text/html")
        return

    if path == "/big/f9/bom.html":
        # UTF-8 byte-order mark in front of an ordinary page.
        self.big_send(
            b"\xef\xbb\xbf" + big_html("bom", "<p>bom</p>%s" % HOME), "text/html"
        )
        return

    if path == "/big/f10/empty.html":
        self.big_send(b"", "text/html")
        return

    if path == "/big/f12/rss.xml":
        self.big_send(_big_rss(port), "text/xml")
        return

    if path == "/big/f12/atom.xml":
        self.big_send(_big_atom(port), "application/xml")
        return

    if path == "/big/f12/sitemap.xml":
        self.big_send(_big_sitemap(port), "text/xml")
        return

    if path.startswith("/big/p/"):
        # Numbered tree pages: /big/p/<n>.html with 0 <= n < BIG_PAGES.
        try:
            index = int(path[len("/big/p/") : -len(".html")])
        except ValueError:
            index = -1
        if 0 <= index < BIG_PAGES and path.endswith(".html"):
            self.big_send(big_page(index, port), "text/html")
        else:
            self.big_error(404, "no such page")
        return

    in_assets = path.startswith("/big/a/")
    if in_assets or path.startswith("/big/x/"):
        # Both prefixes have the same length, so one slice serves either.
        name = path[len("/big/a/") :]
        if in_assets and name in BIG_TEXT_ASSETS:
            text, ctype = BIG_TEXT_ASSETS[name]
            self.big_send(text.encode(), ctype)
        elif name.endswith(".html"):
            # Decoy targets 200 so a parser leak becomes a mirror file.
            self.big_send(big_html(name, "<p>%s</p>" % name), "text/html")
        else:
            suffix = name.rsplit(".", 1)[-1]
            ctype = BIG_CTYPES.get(suffix, "application/octet-stream")
            self.big_send(big_asset(name), ctype)
        return

    if path in BIG_DOWNLOADS:
        ext, cdispo = BIG_DOWNLOADS[path]
        headers = [("Content-Disposition", cdispo)] if cdispo else []
        self.big_send(
            big_asset(path[len("/big/") :] + "." + ext),
            BIG_CTYPES[ext],
            extra=headers,
        )
        return

    if path == "/big/e/404.html":
        self.big_error(404, "Not Found")
        return

    if path == "/big/e/410.html":
        self.big_error(410, "Gone")
        return

    if path == "/big/e/500.html":
        self.big_error(500, "Server Error")
        return

    if path == "/big/e/gztrunc.html":
        # Half a gzip stream, honest Content-Length: decode fails, and the
        # missing Last-Modified keeps it the one uncacheable resource.
        whole = gzip.compress(big_html("gz", "x" * 3000), mtime=0)
        half = whole[: len(whole) // 2]
        self.send_response(200)
        self.send_header("Content-Type", "text/html")
        self.send_header("Content-Encoding", "gzip")
        self.send_header("Content-Length", str(len(half)))
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(half)
        return

    self.big_error(404, "no such big path")
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
def reject_fragment(self):
|
||||
@@ -599,6 +1162,9 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
if path.startswith("/big/"):
|
||||
self.route_big()
|
||||
return True
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
if handler is not None:
|
||||
|
||||
@@ -211,7 +211,9 @@ main() {
|
||||
# lintian ourselves below as the real gate.
|
||||
local -a debuild_opts=(--no-lintian)
|
||||
local -a build_opts=()
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||
# -d: a source build runs no debhelper, so don't require Build-Depends
|
||||
# locally (the buildds and the --sbuild gate enforce them).
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S -d)
|
||||
if [[ $unsigned -eq 1 ]]; then
|
||||
build_opts+=(-us -uc)
|
||||
else
|
||||
@@ -234,12 +236,15 @@ main() {
|
||||
|
||||
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||
# means the local lintian is older than the buildds', not a package
|
||||
# defect, so suppress it. set -e turns any error/warning tag into a failure.
|
||||
# reject the Debian "unstable" distribution. Suppressed tags are stale-local-
|
||||
# lintian skew, not package defects: newer-standards-version, and
|
||||
# recommended-field (old lintian still wants the Priority field the sid
|
||||
# lintian in CI accepts dropping). set -e turns any error/warning tag into
|
||||
# a failure.
|
||||
info "running lintian gate (--fail-on=error,warning)"
|
||||
lintian --profile debian -I -i --fail-on=error,warning \
|
||||
--suppress-tags newer-standards-version "${changes[@]}"
|
||||
--suppress-tags newer-standards-version,recommended-field \
|
||||
"${changes[@]}"
|
||||
|
||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user