mirror of
https://github.com/xroche/httrack.git
synced 2026-06-26 12:07:54 +03:00
Compare commits
7 Commits
fix-delaye
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bfc4a016ab | ||
|
|
756d8fb8bd | ||
|
|
5501faa7b1 | ||
|
|
6322b6fb1f | ||
|
|
58f368a91a | ||
|
|
c97b3e233e | ||
|
|
b615a4e7fd |
@@ -33,8 +33,9 @@ the operational checklist: toolchain, invariants, and how to ship a change.
|
||||
- Be terse. Comment the why, in English; translate French comments you touch.
|
||||
- Strip AI tells from prose (em-dash overuse, rule-of-three, filler, vague
|
||||
attributions). Ref: Wikipedia "Signs of AI writing". Claude Code: `/humanizer`.
|
||||
- Behavior change → add a test. Fast path: a hidden `httrack -#N` debug
|
||||
subcommand (`htscoremain.c`) driven by a `tests/NN_*.test`, over a slow crawl.
|
||||
- Behavior change → add a test. Fast path: a hidden `httrack -#test=NAME` engine
|
||||
self-test (registry in `htsselftest.c`; `-#test` lists them) driven by a
|
||||
`tests/NN_*.test`, in preference to a slow crawl.
|
||||
|
||||
## Review your change adversarially (strongly suggested)
|
||||
Before pushing, and when reviewing others, don't skim for bugs:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
.\"
|
||||
.\" This file is generated by man/makeman.sh; do not edit by hand.
|
||||
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
||||
.TH httrack 1 "13 June 2026" "httrack website copier"
|
||||
.TH httrack 1 "26 June 2026" "httrack website copier"
|
||||
.SH NAME
|
||||
httrack \- offline browser : copy websites to a local directory
|
||||
.SH SYNOPSIS
|
||||
@@ -313,12 +313,8 @@ debug HTTP headers in logfile (\-\-debug\-headers)
|
||||
.SS Guru options: (do NOT use if possible)
|
||||
.IP \-#X
|
||||
*use optimized engine (limited memory boundary checks) (\-\-fast\-engine)
|
||||
.IP \-#0
|
||||
filter test (\-#0 '*.gif' 'www.bar.com/foo.gif') (\-\-debug\-testfilters <param>)
|
||||
.IP \-#1
|
||||
simplify test (\-#1 ./foo/bar/../foobar)
|
||||
.IP \-#2
|
||||
type test (\-#2 /foo/bar.php)
|
||||
.IP \-#test
|
||||
list engine self\-tests (run one with \-#test=NAME [args])
|
||||
.IP \-#C
|
||||
cache list (\-#C '*.com/spider*.gif' (\-\-debug\-cache <param>)
|
||||
.IP \-#R
|
||||
|
||||
@@ -56,7 +56,7 @@ whttrackrundir = $(bindir)
|
||||
whttrackrun_SCRIPTS = webhttrack
|
||||
|
||||
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htscache_selftest.c htsdns_selftest.c \
|
||||
htscache_selftest.c htsdns_selftest.c htsselftest.c \
|
||||
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
||||
htshelp.c htslib.c htscoremain.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
@@ -66,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htscatchurl.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htsselftest.h htscatchurl.h \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
|
||||
@@ -220,6 +220,25 @@ struct cache_back_zip_entry {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||
did. */
|
||||
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||
const char *what, int zErr) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (check_fatal_io_errno()) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
}
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
}
|
||||
|
||||
/* Ajout d'un fichier en cache */
|
||||
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_adr, const char *url_fil, const char *url_save,
|
||||
@@ -236,6 +255,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_save_suffix = url_save;
|
||||
int zErr;
|
||||
|
||||
/* already failed and aborting; don't touch the broken stream again */
|
||||
if (cache->zipWriteFailed)
|
||||
return;
|
||||
|
||||
// robots.txt hack
|
||||
if (url_save == NULL) {
|
||||
dataincache = 0; // testing links
|
||||
@@ -346,9 +369,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
*/
|
||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||
int zip_zipOpenNewFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipOpenNewFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Write data in cache */
|
||||
@@ -358,9 +380,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||
(int) r->size)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -381,9 +402,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||
(int) nl)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||
zErr);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} while(nl > 0);
|
||||
@@ -397,16 +419,14 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
|
||||
/* Close */
|
||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||
int zip_zipCloseFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipCloseFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Flush */
|
||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||
int zip_zipFlush_failed = 0;
|
||||
|
||||
assertf(zip_zipFlush_failed);
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htslib.h"
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@@ -316,6 +317,136 @@ static int disk_fallback_selftest(httrackp *opt) {
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* Failure-injection state handed to the ZIP write callback as its opaque
   pointer. */
typedef struct {
  /** bytes allowed through before writes start failing */
  size_t budget;
  /** errno set on the failing write (ENOSPC, EIO, ...) */
  int fail_errno;
  /** zwrite call count, to detect re-entry into the stream */
  int writes;
} writefail_inject;
|
||||
|
||||
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||
(the #174/#219 condition). Counts calls so the test can prove a flagged cache
|
||||
never re-enters the stream. */
|
||||
static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||
const void *buf, uLong size) {
|
||||
writefail_inject *inj = (writefail_inject *) opaque;
|
||||
|
||||
inj->writes++;
|
||||
if (inj->budget >= (size_t) size) {
|
||||
inj->budget -= (size_t) size;
|
||||
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||
}
|
||||
errno = inj->fail_errno;
|
||||
return 0; /* short write -> the minizip op returns an error */
|
||||
}
|
||||
|
||||
/* Open a ZIP whose writes fail past inj->budget, so cache_add() hits an error.
|
||||
*/
|
||||
static zipFile selftest_open_failing_zip(const char *path,
|
||||
writefail_inject *inj) {
|
||||
zlib_filefunc_def ff;
|
||||
|
||||
fill_fopen_filefunc(&ff); /* real fopen/read/seek/close; ignores opaque */
|
||||
ff.zwrite_file = selftest_failing_zwrite;
|
||||
ff.opaque = inj;
|
||||
return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &ff);
|
||||
}
|
||||
|
||||
/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP). */
|
||||
static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||
const char *body, size_t body_len) {
|
||||
htsblk r;
|
||||
char locbuf[4];
|
||||
char *bodycopy = malloct(body_len);
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = 200;
|
||||
r.size = (LLint) body_len;
|
||||
strcpybuff(r.msg, "OK");
|
||||
strcpybuff(r.contenttype, "application/octet-stream");
|
||||
locbuf[0] = '\0';
|
||||
r.location = locbuf;
|
||||
r.is_write = 0;
|
||||
memcpy(bodycopy, body, body_len);
|
||||
r.adr = bodycopy;
|
||||
cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
|
||||
NULL);
|
||||
freet(bodycopy);
|
||||
}
|
||||
|
||||
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
int fail = 0;
|
||||
char path[HTS_URLMAXSIZE];
|
||||
/* incompressible + big, so deflate flushes (and fails) mid-write, before
|
||||
* close */
|
||||
static const size_t body_len = 256 * 1024;
|
||||
char *body = malloct(body_len);
|
||||
int phase;
|
||||
|
||||
gen_body(body, body_len, 1 /* incompressible */);
|
||||
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||
|
||||
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||
branch). Both must abort the mirror. */
|
||||
for (phase = 0; phase < 2; phase++) {
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
int writes_after_fail;
|
||||
|
||||
inj.budget = (phase == 0) ? 4096 : 0;
|
||||
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||
inj.writes = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
if (cache.zipOutput == NULL) {
|
||||
fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
|
||||
fail++;
|
||||
continue;
|
||||
}
|
||||
|
||||
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (!cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||
phase);
|
||||
fail++;
|
||||
}
|
||||
if (opt->state.exit_xh != -1) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
|
||||
phase, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
|
||||
/* a flagged cache must no-op a sibling write: no further backend write */
|
||||
writes_after_fail = inj.writes;
|
||||
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||
if (inj.writes != writes_after_fail) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: sibling write re-entered the broken "
|
||||
"stream (%d extra backend writes)\n",
|
||||
phase, inj.writes - writes_after_fail);
|
||||
fail++;
|
||||
}
|
||||
|
||||
if (cache.zipOutput != NULL) {
|
||||
zipClose(cache.zipOutput,
|
||||
NULL); /* best-effort; may fail on the backend */
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
freet(body);
|
||||
return fail;
|
||||
}
|
||||
|
||||
int cache_selftests(httrackp *opt, const char *dir) {
|
||||
int failures = 0;
|
||||
cache_back cache;
|
||||
|
||||
@@ -52,6 +52,10 @@ int cache_selftests(httrackp *opt, const char *dir);
|
||||
committed file, never by the test). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||
crashing. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -214,6 +214,8 @@ struct cache_back {
|
||||
cache_back_zip_entry *zipEntries;
|
||||
int zipEntriesOffs;
|
||||
int zipEntriesCapa;
|
||||
hts_boolean
|
||||
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -646,9 +646,7 @@ void help(const char *app, int more) {
|
||||
infomsg("");
|
||||
infomsg("Guru options: (do NOT use if possible)");
|
||||
infomsg(" #X *use optimized engine (limited memory boundary checks)");
|
||||
infomsg(" #0 filter test (-#0 '*.gif' 'www.bar.com/foo.gif')");
|
||||
infomsg(" #1 simplify test (-#1 ./foo/bar/../foobar)");
|
||||
infomsg(" #2 type test (-#2 /foo/bar.php)");
|
||||
infomsg(" #test list engine self-tests (run one with -#test=NAME [args])");
|
||||
infomsg(" #C cache list (-#C '*.com/spider*.gif'");
|
||||
infomsg(" #R cache repair (damaged cache)");
|
||||
infomsg(" #d debug parser");
|
||||
|
||||
@@ -4177,9 +4177,10 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
/* Check html -> text/html */
|
||||
const char *a = fil + strlen(fil) - 1;
|
||||
|
||||
while((*a != '.') && (*a != '/') && (a > fil))
|
||||
/* a < fil when fil is empty: bound before dereferencing */
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (*a == '.' && strlen(a) < 32) {
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
|
||||
a++;
|
||||
|
||||
@@ -1729,10 +1729,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
StringBuff(opt->path_log), digest_filename);
|
||||
}
|
||||
|
||||
/* remove refname if any */
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil) {
|
||||
/* remove refname if any; HTS_TRUE if it was removed */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil) {
|
||||
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
||||
|
||||
(void) UNLINK(filename);
|
||||
return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
@@ -104,8 +104,9 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3749,44 +3749,60 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
|
||||
} // bloc
|
||||
// erreur HTTP (ex: 404, not found)
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
|
||||
|| (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
|
||||
) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
|
||||
if (fexist_utf8(heap(ptr)->sav)) {
|
||||
remove(heap(ptr)->sav); // Eliminer
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
|
||||
r->msg, urladr(), urlfil(),
|
||||
heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
|
||||
(r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
|
||||
// 412/416: the resume partial is stale; re-get the whole file (#206)
|
||||
lien_back *itemback = NULL;
|
||||
int had_partial = 0;
|
||||
int ref_existed = 0;
|
||||
int ref_gone;
|
||||
|
||||
// Drop the temp-ref, its partial, and heap->sav so the re-get carries no
|
||||
// Range; else back_add rebuilds the same Range and loops.
|
||||
if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
|
||||
&itemback) == 0) {
|
||||
had_partial = 1;
|
||||
ref_existed = 1;
|
||||
// best-effort: an orphaned partial cannot re-Range once the ref is gone
|
||||
if (fexist_utf8(itemback->url_sav))
|
||||
(void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
itemback->url_sav));
|
||||
back_clear_entry(itemback);
|
||||
freet(itemback);
|
||||
}
|
||||
if (!fexist_utf8(heap(ptr)->sav)) { // Bien éliminé? (sinon on boucle..)
|
||||
#if HDEBUG
|
||||
printf("Partial content NOT up-to-date, reget all file for %s\n",
|
||||
heap(ptr)->sav);
|
||||
#endif
|
||||
// don't re-record if the ref survived (it would re-Range and loop)
|
||||
ref_gone =
|
||||
url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
|
||||
!ref_existed;
|
||||
if (fexist_utf8(heap(ptr)->sav)) {
|
||||
had_partial = 1;
|
||||
remove(heap(ptr)->sav);
|
||||
}
|
||||
|
||||
// Re-get once, only if a partial existed and both Range triggers are
|
||||
// gone; a failed removal gives up rather than looping. range_used is
|
||||
// unreliable (it does not survive the delayed-type two-pass).
|
||||
if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
||||
r->msg, urladr(), urlfil());
|
||||
// enregistrer le MEME lien
|
||||
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
heap_top()->testmode = heap(ptr)->testmode; // mode test?
|
||||
heap_top()->link_import = 0; // pas mode import
|
||||
heap_top()->testmode = heap(ptr)->testmode;
|
||||
heap_top()->link_import = 0;
|
||||
heap_top()->depth = heap(ptr)->depth;
|
||||
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
||||
heap_top()->retry = heap(ptr)->retry;
|
||||
heap_top()->premier = heap(ptr)->premier;
|
||||
heap_top()->precedent = ptr;
|
||||
//
|
||||
// canceller lien actuel
|
||||
error = 1;
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
//
|
||||
} else { // oups erreur, plus de mémoire!!
|
||||
XH_uninit; // désallocation mémoire & buffers
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
} else { // out of memory
|
||||
XH_uninit;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Giving up on partial reget (%s) for %s%s", r->msg,
|
||||
urladr(), urlfil());
|
||||
error = 1;
|
||||
}
|
||||
|
||||
|
||||
1093
src/htsselftest.c
Normal file
1093
src/htsselftest.c
Normal file
File diff suppressed because it is too large
Load Diff
52
src/htsselftest.h
Normal file
52
src/htsselftest.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 2026 Xavier Roche and other contributors
|
||||
|
||||
SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||
addresses or to collect any other private information about people. Doing so
|
||||
would dishonor our work and waste the many hours we have spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htsselftest.h */
|
||||
/* named dispatch for the hidden engine self-tests */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSSELFTEST_DEFH
|
||||
#define HTSSELFTEST_DEFH
|
||||
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||
#define HTS_DEF_FWSTRUCT_httrackp
|
||||
typedef struct httrackp httrackp;
|
||||
#endif
|
||||
|
||||
/* Run engine self-test `name` over the positional args argv[0..argc-1], or list
|
||||
the available tests when name is NULL, empty, or "list". Prints the result;
|
||||
returns the process exit code (0 == success). The caller owns option cleanup.
|
||||
Reached through the hidden `httrack -#test[=NAME ...]` subcommand. */
|
||||
int hts_selftest(httrackp *opt, const char *name, int argc, char **argv);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -4,7 +4,7 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#B <dir>').
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
@@ -13,7 +13,7 @@
|
||||
# byte-exact.
|
||||
#
|
||||
# Regenerate the fixture after a deliberate format change with
|
||||
# 'httrack -#B <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||
# 'httrack -#test=cache-golden <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||
# committed file.
|
||||
|
||||
set -eu
|
||||
@@ -37,11 +37,11 @@ trap 'rm -rf "$dir"' EXIT
|
||||
mkdir -p "$dir/hts-cache"
|
||||
cp "$fixture/hts-cache/new.zip" "$dir/hts-cache/new.zip"
|
||||
|
||||
out=$(httrack -#B "$dir")
|
||||
out=$(httrack -#test=cache-golden "$dir")
|
||||
|
||||
# Match the exact success line: the read must have found and verified every
|
||||
# entry, not merely failed to enter the mode (a bad -#B falls through to the
|
||||
# usage screen, which also exits non-zero but never prints this).
|
||||
# entry, not merely failed to enter the mode (a renamed/removed test prints the
|
||||
# registry to stderr, which also exits non-zero but never prints this).
|
||||
test "$out" = "cache-golden: OK" || {
|
||||
echo "expected 'cache-golden: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
24
tests/01_engine-cache-writefail.test
Normal file
24
tests/01_engine-cache-writefail.test
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
#
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
# tool flags despite the #!/bin/bash above.

# Cache write-failure handling (httrack -#test=cache-writefail <dir>). #174/#219.
# A failing new.zip write (disk full) used to crash the process via assertf; it
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
# self-test asserts that; reverting the fix makes -#test=cache-writefail abort
# (SIGABRT) and fail.

set -eu

dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT

# Under 'set -e' a failing command substitution would end the script right
# here with no message; catch it so the failure is reported along with
# whatever was printed on stdout.
out=$(httrack -#test=cache-writefail "$dir") || {
    echo "httrack -#test=cache-writefail exited non-zero, got: $out" >&2
    exit 1
}

# Match the exact success line (error logs also go to stdout); a renamed/removed
# test prints the registry to stderr, which exits non-zero but never prints this.
printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
    echo "expected 'cache-writefail: OK', got: $out" >&2
    exit 1
}
|
||||
@@ -4,7 +4,7 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Cache create/read/update logic (driven by 'httrack -#A <dir>').
|
||||
# Cache create/read/update logic (driven by 'httrack -#test=cache <dir>').
|
||||
#
|
||||
# The in-process self-test stores several hand-crafted edge entries (normal
|
||||
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
|
||||
@@ -20,13 +20,13 @@ set -eu
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
# Like the other -# debug modes, a trailing token (the working directory) is
|
||||
# required; a bare '-#A' falls through to the usage screen.
|
||||
out=$(httrack -#A "$dir")
|
||||
# The working directory is a required argument; without it the test prints a
|
||||
# usage line to stderr and returns non-zero.
|
||||
out=$(httrack -#test=cache "$dir")
|
||||
|
||||
# Match the exact success line, so the test cannot pass for an unrelated reason
|
||||
# (e.g. the -#A mode being gone and falling through to the usage screen, which
|
||||
# also exits non-zero but never prints this).
|
||||
# (e.g. the cache test being gone, which prints the registry to stderr but
|
||||
# never prints this line).
|
||||
test "$out" = "cache-selftest: OK" || {
|
||||
echo "expected 'cache-selftest: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
||||
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||
# -#test=charset <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||
conv() {
|
||||
test "$(httrack -O /dev/null -#3 "$1" "$2")" == "$3" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=charset "$1" "$2")" == "$3" || exit 1
|
||||
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
runs() {
|
||||
httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||
httrack -O /dev/null -#test=charset "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||
}
|
||||
|
||||
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
||||
@@ -31,7 +31,7 @@ conv 'us-ascii' 'hello' 'hello'
|
||||
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
|
||||
# decoded and yields empty output (an error is printed to stderr).
|
||||
conv 'no-such-charset-xyz' 'abc' 'abc'
|
||||
test "$(httrack -O /dev/null -#3 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=charset 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||
|
||||
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
|
||||
runs 'utf-8' $'\x80'
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#test=cookies' selftest.
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
||||
out=$(httrack -#Q run)
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=cookies run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||
# to stderr) can't pass.
|
||||
test "$out" = "cookie-header: OK" || {
|
||||
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
@@ -2,15 +2,16 @@
|
||||
#
|
||||
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
||||
# they silently stop being copied. Driven by the in-process 'httrack -#test=copyopt' test.
|
||||
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
||||
out=$(httrack -#9 run)
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=copyopt run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||
# to stderr) can't pass.
|
||||
test "$out" = "copy-htsopt: OK" || {
|
||||
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
@@ -5,9 +5,8 @@ set -euo pipefail
|
||||
|
||||
# DNS resolver/cache self-test: a mock getaddrinfo (no network) checks address
|
||||
# family, single-address selection, the -@i4/-@i6 family filter, and cache reuse.
|
||||
# The trailing token is required, like the other -# selftests, so a bare command
|
||||
# line isn't treated as "no arguments" and routed to the usage screen.
|
||||
out=$(httrack -#D run)
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=dns run)
|
||||
|
||||
test "$out" = "dns-selftest: OK" || {
|
||||
echo "expected 'dns-selftest: OK', got: $out" >&2
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
||||
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
||||
# -#test=entities <string> prints the string with entities decoded (UTF-8 output).
|
||||
ent() {
|
||||
test "$(httrack -O /dev/null -#6 "$1")" == "$2" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=entities "$1")" == "$2" || exit 1
|
||||
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
runs() {
|
||||
httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || exit 1
|
||||
httrack -O /dev/null -#test=entities "$1" >/dev/null 2>&1 || exit 1
|
||||
}
|
||||
|
||||
# named entities
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
|
||||
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||
# -#test=filter <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||
|
||||
match() {
|
||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does match $1" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does match $1" || exit 1
|
||||
}
|
||||
nomatch() {
|
||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||
}
|
||||
|
||||
# bare star matches everything
|
||||
|
||||
@@ -3,5 +3,7 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# httrack internal hashtable autotest on 100K keys
|
||||
httrack -#7 100000
|
||||
# httrack internal hashtable autotest on 100K keys. Assert the success line (on
|
||||
# stderr) so a misrouted registry entry can't pass on exit code alone.
|
||||
out=$(httrack -#test=hashtable 100000 2>&1)
|
||||
printf '%s\n' "$out" | grep -q "all hashtable tests were successful!" || exit 1
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# IDNA / punycode encode (-#4) and decode (-#5). This code has a CVE history,
|
||||
# IDNA / punycode encode (-#test=idna-encode) and decode (-#test=idna-decode). This code has a CVE history,
|
||||
# so the edge cases below cover passthrough, round-trips, and malformed input.
|
||||
|
||||
enc() { test "$(httrack -O /dev/null -#4 "$1")" == "$2" || exit 1; }
|
||||
dec() { test "$(httrack -O /dev/null -#5 "$1")" == "$2" || exit 1; }
|
||||
enc() { test "$(httrack -O /dev/null -#test=idna-encode "$1")" == "$2" || exit 1; }
|
||||
dec() { test "$(httrack -O /dev/null -#test=idna-decode "$1")" == "$2" || exit 1; }
|
||||
# crash probe: malformed ACE input must exit cleanly, not abort.
|
||||
runs() { httrack -O /dev/null -#5 "$1" >/dev/null 2>&1 || exit 1; }
|
||||
runs() { httrack -O /dev/null -#test=idna-decode "$1" >/dev/null 2>&1 || exit 1; }
|
||||
|
||||
# encode
|
||||
enc 'www.café.com' 'www.xn--caf-dma.com'
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# MIME type guessing from extension (get_httptype / give_mimext).
|
||||
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||
# -#test=mime <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||
|
||||
mime() {
|
||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||
}
|
||||
unknown() {
|
||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||
}
|
||||
|
||||
mime '/a/b.html' 'text/html'
|
||||
|
||||
@@ -8,7 +8,7 @@ set -euo pipefail
|
||||
# relative path from <curr>'s directory to <link>
|
||||
rel() {
|
||||
local got
|
||||
got=$(httrack -O /dev/null -#l "$1" "$2")
|
||||
got=$(httrack -O /dev/null -#test=relative "$1" "$2")
|
||||
test "$got" == "relative=$3" ||
|
||||
{
|
||||
echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"
|
||||
@@ -19,7 +19,7 @@ rel() {
|
||||
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
|
||||
ident() {
|
||||
local got
|
||||
got=$(httrack -O /dev/null -#i "$1" "$2" "$3")
|
||||
got=$(httrack -O /dev/null -#test=resolve "$1" "$2" "$3")
|
||||
test "$got" == "$4" ||
|
||||
{
|
||||
echo "FAIL ident($1, $2, $3): got '$got' want '$4'"
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Local save-name extension resolution (url_savename via -#N <fil> <content-type>).
|
||||
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
|
||||
# Asserts on the basename of "savename: <path>".
|
||||
|
||||
name() {
|
||||
out="$(httrack -O /dev/null -#N "$1" "$2" | sed -n 's/^savename: //p')"
|
||||
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
|
||||
test "${out##*/}" == "$3" || {
|
||||
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||
exit 1
|
||||
|
||||
17
tests/01_engine-selftest-dispatch.test
Normal file
17
tests/01_engine-selftest-dispatch.test
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# The -#test dispatch itself: a bare -#test lists the registry, and an unknown
|
||||
# name errors (non-zero, diagnostic) instead of silently passing.
|
||||
|
||||
set -eu
|
||||
|
||||
# Bare -#test lists known tests (printed to stderr).
|
||||
list=$(httrack -#test 2>&1)
|
||||
printf '%s\n' "$list" | grep -q "filter" || exit 1
|
||||
printf '%s\n' "$list" | grep -q "cache-writefail" || exit 1
|
||||
|
||||
# Unknown name: non-zero exit + diagnostic, and no test result line.
|
||||
rc=0
|
||||
err=$(httrack -#test=bogus 2>&1) || rc=$?
|
||||
test "$rc" -ne 0 || exit 1
|
||||
printf '%s\n' "$err" | grep -q "Unknown self-test" || exit 1
|
||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
||||
|
||||
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
||||
simp() {
|
||||
test "$(httrack -O /dev/null -#1 "$1")" == "simplified=$2" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=simplify "$1")" == "simplified=$2" || exit 1
|
||||
}
|
||||
|
||||
simp './foo/bar/' 'foo/bar/'
|
||||
|
||||
@@ -3,23 +3,22 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# htssafe.h bounded string operations (driven by 'httrack -#8').
|
||||
# htssafe.h bounded string operations (driven by 'httrack -#test=strsafe').
|
||||
|
||||
# Success path: every bounded op (strcpybuff/strcatbuff/strncatbuff/strlcpybuff)
|
||||
# must behave correctly. Like the other -# debug modes, a trailing token is
|
||||
# required (a bare '-#8' falls through to the usage screen).
|
||||
# must behave correctly. 'run' selects the success path (vs the overflow modes).
|
||||
rc=0
|
||||
out=$(httrack -#8 run) || rc=$?
|
||||
out=$(httrack -#test=strsafe run) || rc=$?
|
||||
test "$rc" -eq 0 || exit 1
|
||||
test "$out" == "strsafe: OK" || exit 1
|
||||
|
||||
# Overflow path: an over-capacity write into a sized buffer must be caught by
|
||||
# the bounded macro and abort the process, not be silently truncated/completed.
|
||||
# Assert the htssafe abort signature specifically, so the test cannot pass for
|
||||
# an unrelated reason (e.g. the -#8 mode being gone and falling through to the
|
||||
# usage screen, which also exits non-zero).
|
||||
# an unrelated reason (e.g. the strsafe test being gone, which prints the
|
||||
# registry to stderr and also exits non-zero).
|
||||
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
||||
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
||||
err=$(httrack -#test=strsafe overflow "this string is far too long for the buffer" 2>&1) || true
|
||||
case "$err" in
|
||||
*"strsafe: NOT aborted"*)
|
||||
echo "over-capacity write was NOT caught" >&2
|
||||
@@ -36,7 +35,7 @@ esac
|
||||
# capacity (4 bytes into a 4-byte buffer), so this also pins the boundary: a
|
||||
# '<=' off-by-one in the capacity check would let it through (and print "NOT
|
||||
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
||||
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
||||
err=$(httrack -#test=strsafe overflow-buff "abcd" 2>&1) || true
|
||||
case "$err" in
|
||||
*"strsafe: NOT aborted"*)
|
||||
echo "htsbuff over-capacity write was NOT caught" >&2
|
||||
|
||||
113
tests/20_local-resume-loop.test
Executable file
113
tests/20_local-resume-loop.test
Executable file
@@ -0,0 +1,113 @@
|
||||
#!/bin/bash
|
||||
# Issue #206: a continue/update crawl looped forever when the resume Range got a
|
||||
# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
|
||||
set -u
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
testdir=$(cd "$(dirname "$0")" && pwd)
|
||||
server="${testdir}/local-server.py"
|
||||
|
||||
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
|
||||
|
||||
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
|
||||
serverpid=
|
||||
crawlpid=
|
||||
# Teardown, run from the trap: force-kill a crawl that is still running, stop
# and reap the test server, then drop the scratch directory.
cleanup() {
    if [ -n "$crawlpid" ]; then
        kill -9 "$crawlpid" 2>/dev/null
    fi
    [ -z "$serverpid" ] || {
        kill "$serverpid" 2>/dev/null
        wait "$serverpid" 2>/dev/null
    }
    rm -rf "$tmpdir"
}
|
||||
trap cleanup EXIT HUP INT QUIT PIPE TERM
|
||||
|
||||
# --- start the server, discover its ephemeral port --------------------------
|
||||
# The server appends one byte to $RESUME_COUNTER per /resume/blob.txt request; the pass-2 size delta bounds the re-get count.
|
||||
serverlog="${tmpdir}/server.log"
|
||||
counter="${tmpdir}/blobcount"
|
||||
RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
|
||||
serverpid=$!
|
||||
port=
|
||||
for _ in $(seq 1 50); do
|
||||
line=$(head -n1 "$serverlog" 2>/dev/null)
|
||||
if test "${line%% *}" == "PORT"; then
|
||||
port="${line#PORT }"
|
||||
break
|
||||
fi
|
||||
kill -0 "$serverpid" 2>/dev/null || {
|
||||
echo "server exited early: $(cat "$serverlog")"
|
||||
exit 1
|
||||
}
|
||||
sleep 0.1
|
||||
done
|
||||
test -n "$port" || {
|
||||
echo "could not discover server port"
|
||||
exit 1
|
||||
}
|
||||
base="http://127.0.0.1:${port}"
|
||||
|
||||
# Bail out early if httrack is not on PATH. Use the POSIX 'command -v' (as the
# python3 probe above does) rather than the non-standard 'which'.
command -v httrack >/dev/null || {
    echo "could not find httrack"
    exit 1
}
|
||||
out="${tmpdir}/crawl"
|
||||
mkdir "$out"
|
||||
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
|
||||
refdir="${out}/hts-cache/ref"
|
||||
|
||||
# --- pass 1: crawl, interrupt once the blob download is underway -------------
|
||||
printf '[pass 1: interrupt mid-download] ..\t'
|
||||
httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
|
||||
crawlpid=$!
|
||||
# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
|
||||
# finalizes the cache and serializes the temp-ref.
|
||||
for _ in $(seq 1 300); do
|
||||
test -s "$counter" && break
|
||||
kill -0 "$crawlpid" 2>/dev/null || break
|
||||
sleep 0.1
|
||||
done
|
||||
sleep 0.5
|
||||
kill -TERM "$crawlpid" 2>/dev/null
|
||||
wait "$crawlpid" 2>/dev/null
|
||||
crawlpid=
|
||||
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
|
||||
echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
|
||||
exit 1
|
||||
}
|
||||
echo "OK (temp-ref present)"
|
||||
before=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
|
||||
# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
|
||||
# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
|
||||
printf '[pass 2: resume must terminate] ..\t'
|
||||
HANG_RC=137 # 128 + SIGKILL
|
||||
httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
|
||||
crawlpid=$!
|
||||
(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
|
||||
guard=$!
|
||||
rc=0
|
||||
wait "$crawlpid" 2>/dev/null || rc=$?
|
||||
crawlpid=
|
||||
kill "$guard" 2>/dev/null || true
|
||||
wait "$guard" 2>/dev/null || true
|
||||
if test "$rc" -eq "$HANG_RC"; then
|
||||
echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (terminated, rc=$rc)"
|
||||
|
||||
# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
|
||||
# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
|
||||
after=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
hits=$((after - before))
|
||||
printf '[bounded re-get count] ..\t'
|
||||
if test "$hits" -lt 2; then
|
||||
echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
|
||||
exit 1
|
||||
fi
|
||||
if test "$hits" -gt 8; then
|
||||
echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (${hits} requests)"
|
||||
11
tests/21_local-intl-update.test
Normal file
11
tests/21_local-intl-update.test
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# #157: a dotless, accented URL named .html on the first crawl must keep .html
|
||||
# across an update -- not revert to the extensionless name.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'intl/Instalação_CVS_no_Ubuntu.html' \
|
||||
--not-found 'intl/Instalação_CVS_no_Ubuntu' \
|
||||
httrack 'BASEURL/intl/index.html'
|
||||
17
tests/22_local-broken-size.test
Executable file
17
tests/22_local-broken-size.test
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
|
||||
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# Default: warn, but the file is still written. Fail right here if this crawl's
# checks fail: the script has no 'set -e', so its exit status would otherwise
# come only from the -%B run below and a regression here would go unnoticed.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
    --found 'size/oversize.bin' \
    --log-found 'bogus state \(broken size' \
    httrack 'BASEURL/size/index.html' || exit 1
|
||||
|
||||
# -%B (tolerant): no warning, file written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-not-found 'bogus state' \
|
||||
httrack 'BASEURL/size/index.html' '-%B'
|
||||
19
tests/23_local-errpage.test
Normal file
19
tests/23_local-errpage.test
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
|
||||
# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
|
||||
# half also does not reproduce; the purge path is not exercised here.)
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/good.html' \
|
||||
--found 'errpage/empty.html' \
|
||||
--not-found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o0'
|
||||
|
||||
# Control -o1 (default): the 404 error page is written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o1'
|
||||
@@ -26,6 +26,7 @@ TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -41,6 +42,7 @@ TESTS = \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-strsafe.test \
|
||||
02_manpage-regen.test \
|
||||
@@ -59,6 +61,10 @@ TESTS = \
|
||||
16_local-assume.test \
|
||||
17_local-empty-ct.test \
|
||||
18_local-update.test \
|
||||
19_local-connect-fallback.test
|
||||
19_local-connect-fallback.test \
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -107,7 +109,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -196,6 +198,15 @@ if test -n "$rerun"; then
|
||||
exit 1
|
||||
}
|
||||
result "OK (update)"
|
||||
# The update summary reports "files updated"; a fresh crawl never does. Assert
|
||||
# it so a regression that bypasses the cache (re-crawls fresh) can't pass.
|
||||
info "checking update used the cache"
|
||||
if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "update pass did not report cache activity"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
@@ -248,6 +259,22 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-found)
|
||||
i=$((i + 1))
|
||||
info "checking log matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
|
||||
result "not in log"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-not-found)
|
||||
i=$((i + 1))
|
||||
info "checking log lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
|
||||
result "present in log"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -15,6 +15,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
from urllib.parse import quote, unquote, urlsplit
|
||||
|
||||
@@ -176,6 +177,87 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# --- special chars in URLs across an update (issue #157) ---------------
|
||||
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||
# name the first crawl picks (.html) must survive the update pass.
|
||||
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||
|
||||
def route_intl_index(self):
|
||||
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||
|
||||
def route_intl_page(self):
|
||||
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||
|
||||
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||
RESUME_LEN = len(RESUME_PREFIX) + 4096 # declared length never delivered
|
||||
_resume_started = False
|
||||
|
||||
def route_resume_index(self):
|
||||
self.send_html('\t<a href="blob.txt">blob</a>')
|
||||
|
||||
def route_resume(self):
    """Serve /resume/blob.txt for the resume -> 416 loop test (#206).

    Every request appends one byte to the file named by $RESUME_COUNTER (when
    set), so the test can count requests.

    Until the stall has been triggered, requests are answered with a 200 that
    declares RESUME_LEN bytes. The first non-HEAD request receives only
    RESUME_PREFIX and then stalls, so the client can be interrupted while
    holding a partial download. Every request after that gets an empty 416.
    """
    counter = os.environ.get("RESUME_COUNTER")
    if counter:
        with open(counter, "a") as fp:
            fp.write("x")

    is_head = self.command == "HEAD"
    if not Handler._resume_started:
        # Only a request that actually gets the stalled body arms the 416
        # phase; a HEAD arriving first must not use up the one-shot stall.
        if not is_head:
            Handler._resume_started = True
        self.send_response(200)
        self.send_header("Content-Type", "image/png")
        self.send_header("Content-Length", str(self.RESUME_LEN))
        self.send_header("Accept-Ranges", "bytes")
        self.end_headers()
        if not is_head:
            # Flush the prefix so the client has bytes on disk, then never
            # deliver the rest of the declared length.
            self.wfile.write(self.RESUME_PREFIX)
            self.wfile.flush()
            try:
                while True:
                    time.sleep(3600)
            except OSError:
                pass
        return

    # Any later request: the range cannot be satisfied.
    self.send_response(416, "Requested Range Not Satisfiable")
    self.send_header("Content-Type", "image/png")
    self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
    self.send_header("Content-Length", "0")
    self.end_headers()
|
||||
|
||||
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||
def route_errpage_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="good.html">good</a>\n'
|
||||
'\t<a href="missing.html">missing</a>\n'
|
||||
'\t<a href="empty.html">empty</a>\n'
|
||||
)
|
||||
|
||||
def route_errpage_good(self):
|
||||
self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
|
||||
|
||||
def route_errpage_missing(self):
|
||||
self.send_html("\t404 error body", status=404, extra_status="Not Found")
|
||||
|
||||
def route_errpage_empty(self):
|
||||
self.send_raw(b"", "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
def route_size_index(self):
|
||||
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||
|
||||
def route_size_oversize(self):
|
||||
body = b"A" * 100
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
self.send_header("Content-Length", str(len(body) - 2)) # lie: too short
|
||||
self.send_header("Connection", "close")
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -195,6 +277,16 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/style.css": route_types,
|
||||
"/types/data.json": route_types,
|
||||
"/types/gen.php": route_types,
|
||||
"/intl/index.html": route_intl_index,
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
"/size/index.html": route_size_index,
|
||||
"/size/oversize.bin": route_size_oversize,
|
||||
"/errpage/index.html": route_errpage_index,
|
||||
"/errpage/good.html": route_errpage_good,
|
||||
"/errpage/missing.html": route_errpage_missing,
|
||||
"/errpage/empty.html": route_errpage_empty,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
@@ -202,7 +294,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
handler = self.ROUTES.get(path)
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
if handler is not None:
|
||||
handler(self)
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user