mirror of
https://github.com/xroche/httrack.git
synced 2026-07-05 16:44:55 +03:00
Compare commits
2 Commits
p2-4-cache
...
naming-con
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92ad109c30 | ||
|
|
56b809c82d |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -39,6 +39,3 @@ Makefile
|
||||
|
||||
# Editor / autotools backup files.
|
||||
*~
|
||||
|
||||
# Python bytecode (tests/local-server.py).
|
||||
__pycache__/
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.11], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.10], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,11 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:3:0: 3.49.11 only adds enum values, macros and inline helpers to the
|
||||
# installed headers (no struct layout or exported signature changed vs
|
||||
# 3.49.10), so it stays soname .so.3; bump revision.
|
||||
# 3:2:0: 3.49.10 only appends tail fields to the options struct (no existing
|
||||
# symbol or offset changed vs 3.49.9), so it stays soname .so.3; bump revision.
|
||||
# (3:0:0 was the htsblk mime-buffer widening, the ABI break that moved .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:3:0"
|
||||
VERSION_INFO="3:2:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
21
debian/changelog
vendored
21
debian/changelog
vendored
@@ -1,24 +1,3 @@
|
||||
httrack (3.49.11-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: crawl correctness and security fixes (network-facing
|
||||
buffer overflows, file-type detection, redirect handling) and modernized
|
||||
web defaults; full list in history.txt.
|
||||
* Add DEP-12 upstream metadata (#466).
|
||||
* Bump debhelper compat to 14 (#466).
|
||||
* Drop the redundant Priority field and update the NMU lintian override to
|
||||
the current tag names (#466).
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 05 Jul 2026 00:03:18 +0200
|
||||
|
||||
httrack (3.49.10-2) unstable; urgency=medium
|
||||
|
||||
* Fix FTBFS: tests/28_local-pause failed instead of skipping when python3 is
|
||||
absent (the local-server tests need python3, which the buildds lack). Add
|
||||
patches/skip-local-pause-test-without-python3.patch to guard the test on
|
||||
python3 up front, like its siblings, so it skips cleanly.
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 28 Jun 2026 20:18:46 +0200
|
||||
|
||||
httrack (3.49.10-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: new download-pacing and URL-handling options plus a
|
||||
|
||||
17
history.txt
17
history.txt
@@ -4,23 +4,6 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-11
|
||||
+ New: parse robots.txt Allow rules and path wildcards per RFC 9309 (#452)
|
||||
+ New: advertise deflate in Accept-Encoding and decode deflate responses (#450)
|
||||
+ New: follow <source> and <track> media elements as embedded links (#451)
|
||||
+ New: added modern web MIME types to the type/extension table (#448)
|
||||
+ Fixed: enforce the -E time limit during a slow transfer instead of only between files (#481)
|
||||
+ Fixed: sniff the leading bytes of a download so a misdeclared Content-Type no longer renames a correct URL extension
|
||||
+ Fixed: fast transfers could be saved under their temporary .delayed placeholder name (#5, #107)
|
||||
+ Fixed: follow a redirect that maps to the same saved file instead of writing a self-pointing stub (#159)
|
||||
+ Fixed: several network-facing buffer overflows in the FTP, Java and HTML parsers
|
||||
+ Fixed: the htsjava plugin could not be loaded (hidden entry points, stale library name)
|
||||
+ Fixed: HTML-escape truncation and a cache-buffer leak in the parser
|
||||
+ Changed: modernized the default User-Agent to an honest HTTrack identifier (#449)
|
||||
+ Changed: decode the full WHATWG set of HTML named character references (#443)
|
||||
+ Changed: refreshed stale HTTP status, proxy-port and TLS-floor constants (#453)
|
||||
+ Changed: multiple internal hardening, build, test and CI improvements
|
||||
|
||||
3.49-10
|
||||
+ New: --cookies-file to preload a Netscape cookies.txt before crawling (#215)
|
||||
+ New: --pause to space out file downloads by a random delay (#185)
|
||||
|
||||
@@ -62,7 +62,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
htsalias.c htsthread.c htsindex.c htsbauth.c \
|
||||
htsmd5.c htszlib.c htswrap.c htsconcat.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c htssniff.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
@@ -70,7 +70,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
htsmodules.h htsname.h htsnet.h htssniff.h \
|
||||
htsmodules.h htsname.h htsnet.h \
|
||||
htsopt.h htsrobots.h htsthread.h \
|
||||
htstools.h htswizard.h htswrap.h htszlib.h \
|
||||
htsstrings.h htsarrays.h httrack-library.h \
|
||||
|
||||
@@ -1359,18 +1359,6 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
}
|
||||
|
||||
// effacer entrée
|
||||
/* Discard a cancelled mid-write .delayed placeholder (unusable across runs). */
|
||||
static void back_delayed_discard(httrackp *opt, lien_back *back) {
|
||||
if (back->r.out != NULL) {
|
||||
fclose(back->r.out);
|
||||
back->r.out = NULL;
|
||||
}
|
||||
back->r.is_write = 0;
|
||||
if (opt != NULL)
|
||||
url_savename_refname_remove(opt, back->url_adr, back->url_fil);
|
||||
(void) UNLINK(back->url_sav);
|
||||
}
|
||||
|
||||
int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
const int p) {
|
||||
lien_back *const back = sback->lnk;
|
||||
@@ -1378,12 +1366,6 @@ int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
|
||||
assertf(p >= 0 && p < back_max);
|
||||
if (p >= 0 && p < sback->count) { // on sait jamais..
|
||||
/* mid-write cancel: drop a .delayed placeholder; real-named partials
|
||||
survive for resume (--continue) */
|
||||
if (back[p].r.is_write && IS_DELAYED_EXT(back[p].url_sav) &&
|
||||
(back[p].status != STATUS_READY || back[p].r.statuscode <= 0)) {
|
||||
back_delayed_discard(opt, &back[p]);
|
||||
}
|
||||
// Vérificateur d'intégrité
|
||||
#if DEBUG_CHECKINT
|
||||
_CHECKINT(&back[p], "Appel back_delete")
|
||||
@@ -2437,34 +2419,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back_clean(opt, cache, sback);
|
||||
#endif
|
||||
|
||||
/* Time limit exceeded past grace: abort in-flight transfers so no wait loop
|
||||
starves (#481). FTP slots stay, their thread owns the socket. */
|
||||
if (!back_checkmirror(opt)) {
|
||||
int aborted = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < (unsigned int) back_max; i++) {
|
||||
if (back[i].status > 0 && back[i].status < STATUS_FTP_TRANSFER) {
|
||||
if (back[i].r.soc != INVALID_SOCKET) {
|
||||
deletehttp(&back[i].r);
|
||||
}
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
/* drop a .delayed placeholder; real partials survive for resume */
|
||||
if (back[i].r.is_write && IS_DELAYED_EXT(back[i].url_sav))
|
||||
back_delayed_discard(opt, &back[i]);
|
||||
back[i].r.statuscode = STATUSCODE_TIMEOUT;
|
||||
strcpybuff(back[i].r.msg, "Mirror Time Out");
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
aborted++;
|
||||
}
|
||||
}
|
||||
if (aborted > 0)
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"time limit reached, %d transfer(s) aborted", aborted);
|
||||
return;
|
||||
}
|
||||
|
||||
// recevoir tant qu'il y a des données (avec un maximum de max_loop boucles)
|
||||
do_wait = 0;
|
||||
gestion_timeout = 0;
|
||||
@@ -4210,11 +4164,6 @@ int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Grace (in seconds) left to the smooth stop before in-flight transfers are
   aborted: one tenth of maxtime, clamped to the range [5, 30]. */
static int back_maxtime_grace(const int maxtime) {
  const int tenth = maxtime / 10;

  if (tenth < 5)
    return 5;
  if (tenth > 30)
    return 30;
  return tenth;
}
|
||||
|
||||
int back_checkmirror(httrackp * opt) {
|
||||
// Check max size
|
||||
if ((opt->maxsite > 0) && (HTS_STAT.stat_bytes >= opt->maxsite)) {
|
||||
@@ -4231,19 +4180,13 @@ int back_checkmirror(httrackp * opt) {
|
||||
*/
|
||||
}
|
||||
// Check max time
|
||||
if (opt->maxtime > 0) {
|
||||
const TStamp elapsed = time_local() - HTS_STAT.stat_timestart;
|
||||
|
||||
if (elapsed >= opt->maxtime) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
}
|
||||
/* smooth stop starved past the grace period: stop waiting (#481) */
|
||||
if (elapsed - opt->maxtime >= back_maxtime_grace(opt->maxtime))
|
||||
return 0;
|
||||
if ((opt->maxtime > 0)
|
||||
&& ((time_local() - HTS_STAT.stat_timestart) >= opt->maxtime)) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
}
|
||||
}
|
||||
return 1; /* Ok, go on */
|
||||
|
||||
@@ -136,8 +136,6 @@ void back_solve(httrackp * opt, lien_back * sback);
|
||||
int host_wait(httrackp * opt, lien_back * sback);
|
||||
#endif
|
||||
int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize);
|
||||
/* Enforce -M/-E quotas: requests a smooth stop when reached; returns 0 once
|
||||
the -E deadline overran its grace period (callers must stop waiting). */
|
||||
int back_checkmirror(httrackp * opt);
|
||||
|
||||
#endif
|
||||
|
||||
102
src/htscache.c
102
src/htscache.c
@@ -40,7 +40,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsbasenet.h"
|
||||
#include "htsmd5.h"
|
||||
#include <limits.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "htszlib.h"
|
||||
@@ -597,18 +596,15 @@ htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
return cache_readex(opt, cache, adr, fil, save, location, NULL, 1);
|
||||
}
|
||||
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save) {
|
||||
htsblk r = cache_readex(opt, cache, adr, fil, NULL, NULL, return_save, 0);
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil) {
|
||||
htsblk r = cache_read(opt, cache, adr, fil, NULL, NULL);
|
||||
|
||||
if (r.statuscode == -1) {
|
||||
lien_back *itemback = NULL;
|
||||
|
||||
if (back_unserialize_ref(opt, adr, fil, &itemback) == 0) {
|
||||
r = itemback->r;
|
||||
if (return_save != NULL)
|
||||
strlcpybuff(return_save, itemback->url_sav, HTS_URLMAXSIZE * 2);
|
||||
/* cleanup */
|
||||
back_clear_entry(itemback); /* delete entry content */
|
||||
freet(itemback); /* delete item */
|
||||
@@ -769,15 +765,6 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
|
||||
}
|
||||
|
||||
/* A tampered X-Size must be rejected before the size-driven malloc.
|
||||
The alloc casts to int (malloct((int) r.size + 1)), so bound it to
|
||||
[0, INT_MAX): a negative value, or a positive one whose (int) cast
|
||||
truncates negative, would otherwise wrap to a huge allocation. */
|
||||
if (r.size < 0 || r.size >= INT_MAX) {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Cache Read Error : Bad Size");
|
||||
}
|
||||
|
||||
/* Complete fields */
|
||||
r.totalsize = r.size;
|
||||
r.adr = NULL;
|
||||
@@ -804,8 +791,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
} // otherwise, the ZIP file is supposed to be consistent with data.
|
||||
}
|
||||
/* Read data ? */
|
||||
else if (r.statuscode !=
|
||||
STATUSCODE_INVALID) { /* ne pas lire uniquement header */
|
||||
else { /* ne pas lire uniquement header */
|
||||
int ok = 0;
|
||||
|
||||
#if HTS_DIRECTDISK
|
||||
@@ -1431,86 +1417,6 @@ static int hts_rename(httrackp * opt, const char *a, const char *b) {
|
||||
return rename(a, b);
|
||||
}
|
||||
|
||||
/* Pathname of a file inside the mirror dir (rotating concat buffer). */
|
||||
static char *reconcile_path(httrackp *opt, const char *name) {
|
||||
return fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), name);
|
||||
}
|
||||
|
||||
/* Interrupted-run heuristic: prefer the old generation when the new cache
|
||||
stalled below NEW_TINY while the old one grew past OLD_SOLID (historical
|
||||
arbitrary thresholds). */
|
||||
#define CACHE_RECONCILE_NEW_TINY 32768
|
||||
#define CACHE_RECONCILE_OLD_SOLID 65536
|
||||
|
||||
/* Replace the new-generation file by the old one, when the old one exists. */
|
||||
static void reconcile_promote(httrackp *opt, const char *oldname,
|
||||
const char *newname) {
|
||||
if (fexist(reconcile_path(opt, oldname))) {
|
||||
remove(reconcile_path(opt, newname));
|
||||
rename(reconcile_path(opt, oldname), reconcile_path(opt, newname));
|
||||
}
|
||||
}
|
||||
|
||||
/* Non-zero when `name` exists inside the mirror directory. */
static int reconcile_has(httrackp *opt, const char *name) {
  return fexist(reconcile_path(opt, name)) ? 1 : 0;
}

/* Interrupted-run size rule: both files exist, the new one is smaller than
   CACHE_RECONCILE_NEW_TINY, and the old one is larger than both
   CACHE_RECONCILE_OLD_SOLID and the new file. The new file must exist:
   fsize() is -1 for a missing file, which would otherwise pass "< TINY". */
static int reconcile_new_stalled(httrackp *opt, const char *oldname,
                                 const char *newname) {
  if (!reconcile_has(opt, newname) || !reconcile_has(opt, oldname))
    return 0;
  if (fsize(reconcile_path(opt, newname)) >= CACHE_RECONCILE_NEW_TINY)
    return 0;
  if (fsize(reconcile_path(opt, oldname)) <= CACHE_RECONCILE_OLD_SOLID)
    return 0;
  return fsize(reconcile_path(opt, oldname)) >
         fsize(reconcile_path(opt, newname));
}

/* Promote the legacy old.dat/old.ndx pair over new.dat/new.ndx. */
static void reconcile_promote_legacy(httrackp *opt) {
  reconcile_promote(opt, "hts-cache/old.dat", "hts-cache/new.dat");
  reconcile_promote(opt, "hts-cache/old.ndx", "hts-cache/new.ndx");
}

/* Reconcile the on-disk hts-cache/ generations (new.* vs old.*) according to
   `mode`; every step is a no-op when the files it involves are absent.

   CACHE_RECONCILE_PROMOTE:     the previous run rotated new.* to old.* then
                                died before writing; promote the old generation
                                back, whichever format (zip or dat/ndx) it uses.
   CACHE_RECONCILE_INTERRUPTED: aborted run (requires opt->cache and the
                                hts-in_progress.lock file); prefer the old
                                generation when the new one stalled at a
                                suspiciously small size.
   CACHE_RECONCILE_ROLLBACK:    nothing transferred; restore the previous
                                generation and its .lst/.txt sidecars. */
void hts_cache_reconcile(httrackp *opt, hts_cache_reconcile_mode mode) {
  switch (mode) {
  case CACHE_RECONCILE_PROMOTE:
    if (!reconcile_has(opt, "hts-cache/new.zip"))
      reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
    /* legacy pair: promote when the new pair is incomplete and the old pair
       is complete */
    if ((!reconcile_has(opt, "hts-cache/new.dat") ||
         !reconcile_has(opt, "hts-cache/new.ndx")) &&
        reconcile_has(opt, "hts-cache/old.dat") &&
        reconcile_has(opt, "hts-cache/old.ndx")) {
      reconcile_promote_legacy(opt);
    }
    break;
  case CACHE_RECONCILE_INTERRUPTED:
    if (!opt->cache || !reconcile_has(opt, "hts-in_progress.lock"))
      break;
    if (reconcile_new_stalled(opt, "hts-cache/old.zip", "hts-cache/new.zip"))
      reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
    /* the legacy pair is judged on the .dat sizes, and needs old.ndx too */
    if (reconcile_has(opt, "hts-cache/old.ndx") &&
        reconcile_new_stalled(opt, "hts-cache/old.dat", "hts-cache/new.dat")) {
      reconcile_promote_legacy(opt);
    }
    break;
  case CACHE_RECONCILE_ROLLBACK:
    reconcile_promote(opt, "hts-cache/old.zip", "hts-cache/new.zip");
    if (reconcile_has(opt, "hts-cache/old.dat") &&
        reconcile_has(opt, "hts-cache/old.ndx")) {
      reconcile_promote_legacy(opt);
    }
    reconcile_promote(opt, "hts-cache/old.lst", "hts-cache/new.lst");
    reconcile_promote(opt, "hts-cache/old.txt", "hts-cache/new.txt");
    break;
  }
}
|
||||
|
||||
// renvoyer uniquement en tête, ou NULL si erreur
|
||||
// return NULL upon error, and set -1 to r.statuscode
|
||||
htsblk *cache_header(httrackp * opt, cache_back * cache, const char *adr,
|
||||
|
||||
@@ -66,11 +66,8 @@ htsblk cache_read(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
/* Like cache_read, but also yields entries whose transfer broke; return_save
|
||||
(optional, HTS_URLMAXSIZE*2) receives the entry's recorded save name. */
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save);
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil);
|
||||
htsblk cache_readex(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location,
|
||||
char *return_save, int readonly);
|
||||
@@ -78,17 +75,6 @@ htsblk *cache_header(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, htsblk * r);
|
||||
void cache_init(cache_back * cache, httrackp * opt);
|
||||
|
||||
/* Policy selecting which hts-cache/ generation (new.* vs old.*) is
   authoritative. */
typedef enum {
  /* no new cache: promote the old generation */
  CACHE_RECONCILE_PROMOTE,
  /* aborted run: keep the larger generation */
  CACHE_RECONCILE_INTERRUPTED,
  /* nothing transferred: restore the old one */
  CACHE_RECONCILE_ROLLBACK
} hts_cache_reconcile_mode;
|
||||
|
||||
/* Reconcile the on-disk cache generations according to mode; a no-op when
|
||||
the involved files are absent. */
|
||||
void hts_cache_reconcile(httrackp *opt, hts_cache_reconcile_mode mode);
|
||||
|
||||
int cache_writedata(FILE * cache_ndx, FILE * cache_dat, const char *str1,
|
||||
const char *str2, char *outbuff, int len);
|
||||
int cache_readdata(cache_back * cache, const char *str1, const char *str2,
|
||||
|
||||
@@ -716,398 +716,3 @@ int cache_golden_selftest(httrackp *opt, const char *dir, int regen) {
|
||||
|
||||
return failures;
|
||||
}
|
||||
|
||||
/* --- hts_cache_reconcile() policies -------------------------------------- */
|
||||
|
||||
/* Every file the reconcile policies read or write, relative to the mirror
   directory; the whole set is wiped between test cases. */
static const char *const reconcile_files[] = {
  "hts-cache/new.zip",
  "hts-cache/old.zip",
  "hts-cache/new.dat",
  "hts-cache/old.dat",
  "hts-cache/new.ndx",
  "hts-cache/old.ndx",
  "hts-cache/new.lst",
  "hts-cache/old.lst",
  "hts-cache/new.txt",
  "hts-cache/old.txt",
  "hts-in_progress.lock"
};
|
||||
|
||||
/* Selftest helper: join opt->path_log and `name` into a pathname, built in
   one of opt's concat buffers (OPT_GET_BUFF). */
static char *reconcile_st_path(httrackp *opt, const char *name) {
  const char *const base = StringBuff(opt->path_log);

  return fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), base, name);
}
|
||||
|
||||
/* Remove every file listed in reconcile_files; remove() failures (for example
   a file that does not exist) are ignored. */
static void reconcile_wipe(httrackp *opt) {
  const size_t count = sizeof(reconcile_files) / sizeof(reconcile_files[0]);
  const char *const *entry = reconcile_files;
  const char *const *const end = reconcile_files + count;

  while (entry != end) {
    remove(reconcile_st_path(opt, *entry));
    entry++;
  }
}
|
||||
|
||||
/* Create a filler file of exactly `size` bytes. */
|
||||
static void reconcile_put(httrackp *opt, const char *name, size_t size) {
|
||||
FILE *const fp = fopen(reconcile_st_path(opt, name), "wb");
|
||||
static const char filler[1024] = {'x'};
|
||||
|
||||
assertf(fp != NULL);
|
||||
while (size > 0) {
|
||||
const size_t n = size > sizeof(filler) ? sizeof(filler) : size;
|
||||
|
||||
assertf(fwrite(filler, 1, n, fp) == n);
|
||||
size -= n;
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Expect `name` to weigh `size` bytes, or be absent when size == -1. */
|
||||
static int reconcile_expect(httrackp *opt, const char *name, off_t size,
|
||||
const char *what) {
|
||||
const off_t got = fsize(reconcile_st_path(opt, name));
|
||||
|
||||
if (got != size) {
|
||||
fprintf(stderr, "cache-reconcile: %s: %s is %d bytes, expected %d\n", what,
|
||||
name, (int) got, (int) size);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Selftest of the hts_cache_reconcile() policies. Each case wipes the fixture
   files, creates the listed files (in order), runs one reconcile mode, then
   checks the resulting file sizes (-1 meaning "absent"). Returns the number of
   failed expectations. */
int cache_reconcile_selftest(httrackp *opt, const char *dir) {
  /* around the interrupted-run thresholds (new < 32768, old > 65536) */
  enum { TINY = 1024, MID = 40000, SOLID = 131072 };

  struct reconcile_fixture {
    const char *name; /* NULL terminates the list */
    size_t size;
  };
  struct reconcile_check {
    const char *name; /* NULL terminates the list */
    off_t size;       /* expected size, -1 when the file must be absent */
    const char *what; /* label printed on mismatch */
  };
  struct reconcile_case {
    hts_cache_reconcile_mode mode;
    struct reconcile_fixture put[7];
    struct reconcile_check expect[5];
  };

  static const struct reconcile_case cases[] = {
    /* PROMOTE: a zip old generation replaces a missing new one */
    {CACHE_RECONCILE_PROMOTE,
     {{"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", SOLID, "promote-zip"},
      {"hts-cache/old.zip", -1, "promote-zip"}}},

    /* PROMOTE: an existing new.zip is left alone */
    {CACHE_RECONCILE_PROMOTE,
     {{"hts-cache/new.zip", TINY}, {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", TINY, "promote-zip-noop"},
      {"hts-cache/old.zip", SOLID, "promote-zip-noop"}}},

    /* PROMOTE: a pure-legacy old generation is promoted too (was dead when
       no zip cache existed) */
    {CACHE_RECONCILE_PROMOTE,
     {{"hts-cache/old.dat", SOLID}, {"hts-cache/old.ndx", TINY}},
     {{"hts-cache/new.dat", SOLID, "promote-dat"},
      {"hts-cache/new.ndx", TINY, "promote-dat"},
      {"hts-cache/old.dat", -1, "promote-dat"}}},

    /* PROMOTE: a half-written legacy new pair is replaced by the old pair */
    {CACHE_RECONCILE_PROMOTE,
     {{"hts-cache/new.dat", TINY},
      {"hts-cache/old.dat", SOLID},
      {"hts-cache/old.ndx", TINY}},
     {{"hts-cache/new.dat", SOLID, "promote-dat-partial"},
      {"hts-cache/new.ndx", TINY, "promote-dat-partial"}}},

    /* INTERRUPTED: no lock file, no action */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-cache/new.zip", TINY}, {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", TINY, "interrupted-nolock"}}},

    /* INTERRUPTED: an absent new.zip must NOT promote old.zip (fsize(-1)
       would spuriously pass "< TINY"); leave the solid old generation for
       ROLLBACK */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-in_progress.lock", 0}, {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", -1, "interrupted-nonew"},
      {"hts-cache/old.zip", SOLID, "interrupted-nonew"}}},

    /* INTERRUPTED: stalled tiny new.zip loses to a solid old.zip (was dead
       for zip caches: the arm was gated on a legacy new.dat) */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-in_progress.lock", 0},
      {"hts-cache/new.zip", TINY},
      {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", SOLID, "interrupted-zip"},
      {"hts-cache/old.zip", -1, "interrupted-zip"}}},

    /* INTERRUPTED: old below the confidence threshold, keep new */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-in_progress.lock", 0},
      {"hts-cache/new.zip", TINY},
      {"hts-cache/old.zip", MID}},
     {{"hts-cache/new.zip", TINY, "interrupted-smallold"}}},

    /* INTERRUPTED: new big enough to trust, keep it */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-in_progress.lock", 0},
      {"hts-cache/new.zip", MID},
      {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", MID, "interrupted-bignew"}}},

    /* INTERRUPTED: the legacy pair follows the same size rule (was dead
       code) */
    {CACHE_RECONCILE_INTERRUPTED,
     {{"hts-in_progress.lock", 0},
      {"hts-cache/new.dat", TINY},
      {"hts-cache/new.ndx", TINY},
      {"hts-cache/old.dat", SOLID},
      {"hts-cache/old.ndx", MID}},
     {{"hts-cache/new.dat", SOLID, "interrupted-dat"},
      {"hts-cache/new.ndx", MID, "interrupted-dat"}}},

    /* ROLLBACK: the old zip generation is restored (a zip cache used to
       lose its only good generation here) */
    {CACHE_RECONCILE_ROLLBACK,
     {{"hts-cache/new.zip", TINY}, {"hts-cache/old.zip", SOLID}},
     {{"hts-cache/new.zip", SOLID, "rollback-zip"},
      {"hts-cache/old.zip", -1, "rollback-zip"}}},

    /* ROLLBACK: sidecars are restored regardless of format */
    {CACHE_RECONCILE_ROLLBACK,
     {{"hts-cache/new.lst", TINY},
      {"hts-cache/old.lst", MID},
      {"hts-cache/old.txt", MID}},
     {{"hts-cache/new.lst", MID, "rollback-lst"},
      {"hts-cache/new.txt", MID, "rollback-txt"}}},

    /* ROLLBACK: full legacy generation incl. sidecars (historical
       behavior) */
    {CACHE_RECONCILE_ROLLBACK,
     {{"hts-cache/new.dat", TINY},
      {"hts-cache/new.ndx", TINY},
      {"hts-cache/old.dat", SOLID},
      {"hts-cache/old.ndx", MID},
      {"hts-cache/old.lst", MID},
      {"hts-cache/old.txt", MID}},
     {{"hts-cache/new.dat", SOLID, "rollback-dat"},
      {"hts-cache/new.ndx", MID, "rollback-dat"},
      {"hts-cache/new.lst", MID, "rollback-dat"},
      {"hts-cache/new.txt", MID, "rollback-dat"}}},

    /* ROLLBACK: nothing to restore, the new generation stays */
    {CACHE_RECONCILE_ROLLBACK,
     {{"hts-cache/new.zip", TINY}},
     {{"hts-cache/new.zip", TINY, "rollback-noop"}}},
  };

  int failures = 0;
  size_t c;

  golden_setup(opt, dir);
#ifdef _WIN32
  mkdir(reconcile_st_path(opt, "hts-cache"));
#else
  mkdir(reconcile_st_path(opt, "hts-cache"), HTS_PROTECT_FOLDER);
#endif

  for (c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
    const struct reconcile_case *const tc = &cases[c];
    size_t i;

    /* start every case from an empty fixture set */
    reconcile_wipe(opt);
    for (i = 0; tc->put[i].name != NULL; i++)
      reconcile_put(opt, tc->put[i].name, tc->put[i].size);

    hts_cache_reconcile(opt, tc->mode);

    for (i = 0; tc->expect[i].name != NULL; i++)
      failures += reconcile_expect(opt, tc->expect[i].name, tc->expect[i].size,
                                   tc->expect[i].what);
  }

  reconcile_wipe(opt);
  return failures;
}
|
||||
|
||||
/* --- read-side corruption injection --------------------------------------- */
|
||||
|
||||
/* canary read back intact after each corruption; victim gets the byte surgery
|
||||
*/
|
||||
#define CORRUPT_ADR "corrupt.example.com"
|
||||
static char corrupt_body_a[33 + 1];
|
||||
static char corrupt_body_b[44 + 1];
|
||||
|
||||
/* Write a fresh two-entry cache: /canary.html then /victim.html. */
|
||||
static void corrupt_build(httrackp *opt) {
|
||||
cache_back cache;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
memset(corrupt_body_b, 'b', sizeof(corrupt_body_b) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/victim.html", "victim.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_b,
|
||||
strlen(corrupt_body_b));
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
/* Like corrupt_build, but the victim carries a 20-char Etag whose header line
|
||||
is later overwritten with a forged oversized X-Size (same byte length). */
|
||||
static void corrupt_build_etag(httrackp *opt) {
|
||||
cache_back cache;
|
||||
|
||||
memset(corrupt_body_a, 'a', sizeof(corrupt_body_a) - 1);
|
||||
memset(corrupt_body_b, 'b', sizeof(corrupt_body_b) - 1);
|
||||
remove(reconcile_st_path(opt, "hts-cache/new.zip"));
|
||||
remove(reconcile_st_path(opt, "hts-cache/old.zip"));
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/canary.html", "canary.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "", "", "", corrupt_body_a,
|
||||
strlen(corrupt_body_a));
|
||||
store_entry(opt, &cache, CORRUPT_ADR, "/victim.html", "victim.html", 200,
|
||||
"OK", "text/html", "utf-8", "", "AAAAAAAAAAAAAAAAAAAA", "", "",
|
||||
corrupt_body_b, strlen(corrupt_body_b));
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
/* Patch the nth of total occurrences of pat (same-length rep) in new.zip. */
|
||||
static void corrupt_patch(httrackp *opt, const char *pat, size_t patlen,
|
||||
const char *rep, size_t nth, size_t total) {
|
||||
LLint fsz = 0;
|
||||
char *data = readfile2(reconcile_st_path(opt, "hts-cache/new.zip"), &fsz);
|
||||
const size_t n = (size_t) fsz;
|
||||
size_t k, hits = 0, at = 0;
|
||||
FILE *fp;
|
||||
|
||||
assertf(data != NULL);
|
||||
for (k = 0; k + patlen <= n; k++) {
|
||||
if (memcmp(data + k, pat, patlen) == 0) {
|
||||
hits++;
|
||||
if (hits == nth)
|
||||
at = k;
|
||||
}
|
||||
}
|
||||
assertf(hits == total);
|
||||
memcpy(data + at, rep, patlen);
|
||||
fp = fopen(reconcile_st_path(opt, "hts-cache/new.zip"), "wb");
|
||||
assertf(fp != NULL);
|
||||
assertf(fwrite(data, 1, n, fp) == n);
|
||||
fclose(fp);
|
||||
freet(data);
|
||||
}
|
||||
|
||||
/* Garbage the first bytes of the victim's deflated data (2nd local header). */
|
||||
static void corrupt_victim_body(httrackp *opt) {
|
||||
LLint fsz = 0;
|
||||
char *data = readfile2(reconcile_st_path(opt, "hts-cache/new.zip"), &fsz);
|
||||
const size_t n = (size_t) fsz;
|
||||
size_t k, hits = 0, off = 0;
|
||||
FILE *fp;
|
||||
|
||||
assertf(data != NULL);
|
||||
for (k = 0; k + 4 <= n; k++) {
|
||||
if (memcmp(data + k, "PK\x03\x04", 4) == 0 && ++hits == 2) {
|
||||
const size_t namelen =
|
||||
(unsigned char) data[k + 26] | ((unsigned char) data[k + 27] << 8);
|
||||
const size_t extralen =
|
||||
(unsigned char) data[k + 28] | ((unsigned char) data[k + 29] << 8);
|
||||
|
||||
off = k + 30 + namelen + extralen;
|
||||
}
|
||||
}
|
||||
assertf(hits == 2);
|
||||
assertf(off != 0 && off + 4 <= n);
|
||||
memset(data + off, 0xFF, 4);
|
||||
fp = fopen(reconcile_st_path(opt, "hts-cache/new.zip"), "wb");
|
||||
assertf(fp != NULL);
|
||||
assertf(fwrite(data, 1, n, fp) == n);
|
||||
fclose(fp);
|
||||
freet(data);
|
||||
}
|
||||
|
||||
/* Read the corrupt /victim.html and, in the SAME read session, the intact
|
||||
/canary.html: the victim must be rejected (wantmsg pins which path) and the
|
||||
canary must still decode byte-exact, proving one bad entry never taints a
|
||||
sibling read. */
|
||||
static int corrupt_expect_victim(httrackp *opt, const char *wantmsg,
|
||||
const char *what) {
|
||||
cache_back cache;
|
||||
htsblk v, c;
|
||||
char BIGSTK lv[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK lc[HTS_URLMAXSIZE * 2];
|
||||
int fail = 0;
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
lv[0] = lc[0] = '\0';
|
||||
v = cache_readex(opt, &cache, CORRUPT_ADR, "/victim.html", "", lv, NULL, 1);
|
||||
if (v.statuscode != STATUSCODE_INVALID) {
|
||||
fprintf(stderr, "%s: %s: victim: statuscode is %d, expected %d\n",
|
||||
selftest_tag, what, v.statuscode, STATUSCODE_INVALID);
|
||||
fail++;
|
||||
}
|
||||
if (wantmsg != NULL && strcmp(v.msg, wantmsg) != 0) {
|
||||
fprintf(stderr, "%s: %s: victim: msg is '%s', expected '%s'\n",
|
||||
selftest_tag, what, v.msg, wantmsg);
|
||||
fail++;
|
||||
}
|
||||
c = cache_readex(opt, &cache, CORRUPT_ADR, "/canary.html", "", lc, NULL, 1);
|
||||
if (c.statuscode != 200 || c.adr == NULL ||
|
||||
c.size != (LLint) strlen(corrupt_body_a) ||
|
||||
memcmp(c.adr, corrupt_body_a, strlen(corrupt_body_a)) != 0) {
|
||||
fprintf(stderr, "%s: %s: canary tainted (status %d)\n", selftest_tag, what,
|
||||
c.statuscode);
|
||||
fail++;
|
||||
}
|
||||
if (v.adr != NULL)
|
||||
freet(v.adr);
|
||||
if (c.adr != NULL)
|
||||
freet(c.adr);
|
||||
selftest_close(&cache);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* One zip corruption case: build, patch, then check victim+canary in-session.
|
||||
*/
|
||||
static int corrupt_case_zip(httrackp *opt, const char *pat, const char *rep,
|
||||
size_t nth, size_t total, const char *wantmsg,
|
||||
const char *what) {
|
||||
corrupt_build(opt);
|
||||
corrupt_patch(opt, pat, strlen(pat), rep, nth, total);
|
||||
return corrupt_expect_victim(opt, wantmsg, what);
|
||||
}
|
||||
|
||||
int cache_corruption_selftest(httrackp *opt, const char *dir) {
  /* Pattern-patch cases, run in order through corrupt_case_zip(). */
  static const struct {
    const char *pat;     /* bytes to find in new.zip */
    const char *rep;     /* same-length replacement */
    size_t nth;          /* 1-based occurrence to patch */
    size_t total;        /* expected number of occurrences */
    const char *wantmsg; /* message expected on the rejected victim */
    const char *what;    /* case label for diagnostics */
  } zip_cases[] = {
      {"X-Size: 44", "X-Size: 99", 1, 1, "Cache Read Error : Read Data",
       "oversized X-Size"},
      {"X-Size: 44", "X-Size: -4", 1, 1, "Cache Read Error : Bad Size",
       "negative X-Size"},
      /* both entries carry the line; the victim's is the second */
      {"X-In-Cache: 1", "X-In-Cache: 0", 2, 2,
       "Previous cache file not found (empty filename)", "blanked X-In-Cache"},
      /* smashed local file header: the entry is dropped at index load */
      {"PK\x03\x04", "XK\x03\x04", 2, 2, "File Cache Entry Not Found",
       "smashed local header"},
  };
  size_t i;
  int failures = 0;

  selftest_tag = "cache-corrupt";
  golden_setup(opt, dir);

  for (i = 0; i < sizeof(zip_cases) / sizeof(zip_cases[0]); i++) {
    failures += corrupt_case_zip(opt, zip_cases[i].pat, zip_cases[i].rep,
                                 zip_cases[i].nth, zip_cases[i].total,
                                 zip_cases[i].wantmsg, zip_cases[i].what);
  }

  /* Garbled compressed payload: header intact, deflate data destroyed. */
  corrupt_build(opt);
  corrupt_victim_body(opt);
  failures += corrupt_expect_victim(opt, "Cache Read Error : Read Data",
                                    "garbled deflate stream");

  /* X-Size above INT_MAX: positive as a 64-bit value (so a bare sign check
     lets it through) but negative once truncated by the (int) cast used for
     the allocation, i.e. a wraparound alloc. cache_add asserts that sizes fit
     an int, so only a corrupt or foreign cache can present such a value. It
     is injected by overwriting the victim's long Etag line with a forged
     X-Size line of the same length (the parser keeps the last X-Size it
     sees), which leaves the zip byte length and offsets intact. */
  corrupt_build_etag(opt);
  corrupt_patch(opt, "Etag: AAAAAAAAAAAAAAAAAAAA", 26,
                "X-Size: 2147483648AAAAAAAA", 1, 1);
  failures += corrupt_expect_victim(opt, "Cache Read Error : Bad Size",
                                    "X-Size above INT_MAX");

  return failures;
}
|
||||
|
||||
@@ -56,15 +56,6 @@ int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
crashing. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
/* Exercise the hts_cache_reconcile() generation policies on file fixtures
|
||||
under <dir>. Returns the failed-check count. */
|
||||
int cache_reconcile_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
/* Inject read-side corruption (zip byte surgery: bad size, header, deflate)
|
||||
under <dir> and assert every case degrades to STATUSCODE_INVALID without
|
||||
tainting a sibling entry. */
|
||||
int cache_corruption_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2137,7 +2137,47 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"No data seems to have been transferred during this session! : restoring previous one!");
|
||||
XH_uninit;
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_ROLLBACK);
|
||||
if ((fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log), "hts-cache/old.dat")))
|
||||
&&
|
||||
(fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx")))) {
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.lst"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.txt"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.lst"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.lst"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.txt"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.txt"));
|
||||
}
|
||||
opt->state.exit_xh = 2; /* interrupted (no connection detected) */
|
||||
return 1;
|
||||
}
|
||||
@@ -3331,41 +3371,6 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
return n;
|
||||
}
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr) {
|
||||
engine_stats();
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
return RUN_CALLBACK7(
|
||||
opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Single implementation of the historical WAIT_FOR_AVAILABLE_SOCKET macros. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr) {
|
||||
const int prev = opt->state._hts_in_html_parsing;
|
||||
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) {
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, 0);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
if (!hts_loop_tick(sback, opt, -1, ptr))
|
||||
return HTS_FALSE;
|
||||
}
|
||||
opt->state._hts_in_html_parsing = prev;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
int back_pluggable_sockets(struct_back * sback, httrackp * opt) {
|
||||
int n;
|
||||
|
||||
|
||||
@@ -432,15 +432,6 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr);
|
||||
|
||||
/* Wait until a test socket can be plugged, pumping transfers, stats and the
|
||||
loop callback; gives up past the -E deadline. HTS_FALSE = callback abort. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr);
|
||||
|
||||
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
|
||||
timestamp seed so it is stable within one gap and rerolls per launch. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
|
||||
|
||||
@@ -544,11 +544,69 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
}
|
||||
|
||||
// No new cache but an old one? promote it
|
||||
// Existence d'un cache - pas de new mais un old.. renommer
|
||||
#if DEBUG_STEPS
|
||||
printf("Checking cache\n");
|
||||
#endif
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_PROMOTE);
|
||||
if (!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/new.zip"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/old.zip"))) {
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
}
|
||||
} else
|
||||
if ((!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/new.dat")))
|
||||
||
|
||||
(!fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx")))) {
|
||||
if ((fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log), "hts-cache/old.dat")))
|
||||
&&
|
||||
(fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx")))) {
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//remove(fconcat(StringBuff(opt->path_log),"hts-cache/new.lst"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//rename(fconcat(StringBuff(opt->path_log),"hts-cache/old.lst"),fconcat(StringBuff(opt->path_log),"hts-cache/new.lst"));
|
||||
}
|
||||
}
|
||||
|
||||
/* Interrupted mirror detected */
|
||||
if (!opt->quiet) {
|
||||
@@ -2496,8 +2554,109 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
printf("Cache & log settings\n");
|
||||
#endif
|
||||
|
||||
// If both cache generations exist, keep the most complete one
|
||||
hts_cache_reconcile(opt, CACHE_RECONCILE_INTERRUPTED);
|
||||
// on utilise le cache..
|
||||
// en cas de présence des deux versions, garder la version la plus avancée,
|
||||
// cad la version contenant le plus de fichiers
|
||||
if (opt->cache) {
|
||||
if (fexist(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log), "hts-in_progress.lock"))) { // problemes..
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"))) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip")) < 32768) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip")) > 65536) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip")) > fsize(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->
|
||||
path_log),
|
||||
"hts-cache/new.zip")))
|
||||
{
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.zip"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.zip"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"))
|
||||
&&
|
||||
fexist(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"))) {
|
||||
if (fexist
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"))
|
||||
&&
|
||||
fexist(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"))) {
|
||||
// switcher si new<32Ko et old>65Ko (tailles arbitraires) ?
|
||||
// ce cas est peut être une erreur ou un crash d'un miroir ancien, prendre
|
||||
// alors l'ancien cache
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat")) < 32768) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat")) > 65536) {
|
||||
if (fsize
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat")) > fsize(fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->
|
||||
path_log),
|
||||
"hts-cache/new.dat")))
|
||||
{
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
remove(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.dat"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.dat"));
|
||||
rename(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/old.ndx"), fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_log),
|
||||
"hts-cache/new.ndx"));
|
||||
//} else { // ne rien faire
|
||||
// remove("hts-cache/old.dat");
|
||||
// remove("hts-cache/old.ndx");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Débuggage des en têtes
|
||||
if (_DEBUG_HEAD) {
|
||||
ioinfo =
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-11"
|
||||
#define HTTRACK_VERSIONID "3.49.11"
|
||||
#define HTTRACK_VERSION "3.49-10"
|
||||
#define HTTRACK_VERSIONID "3.49.10"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
|
||||
224
src/htsname.c
224
src/htsname.c
@@ -41,10 +41,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htstools.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#define ADD_STANDARD_PATH \
|
||||
@@ -74,6 +70,31 @@ static const char *hts_tbdev[] = {
|
||||
""
|
||||
};
|
||||
|
||||
/* Block until a test socket can be plugged. Expects opt, sback, cache and ptr
   in the caller's scope. While waiting, the parsing state is set to 6, the
   transfers are pumped, the stats refreshed and the loop callback run; if the
   callback returns 0 the enclosing function returns -1 (without restoring the
   parsing state). */
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
  const int saved_state_ = opt->state._hts_in_html_parsing; \
  while (back_pluggable_sockets_strict(sback, opt) <= 0) { \
    opt->state._hts_in_html_parsing = 6; \
    /* pump pending transfers */ \
    back_wait(sback, opt, cache, 0); \
    /* transfer rate */ \
    engine_stats(); \
    /* counters handed to the callback */ \
    HTS_STAT.stat_nsocket = back_nsoc(sback); \
    HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \
    HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \
    HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \
    HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \
    HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \
    /* let the shell/user abort */ \
    if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1, ptr, \
                       opt->lien_tot, \
                       (int) (time_local() - HTS_STAT.stat_timestart), \
                       &HTS_STAT)) \
      return -1; \
  } \
  opt->state._hts_in_html_parsing = saved_state_; \
} while(0)
|
||||
|
||||
/* Strip all // */
|
||||
static void cleanDoubleSlash(char *s) {
|
||||
int i, j;
|
||||
@@ -119,7 +140,8 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
|
||||
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
|
||||
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
|
||||
(#267 guard), a declared disagreement is CONTESTED (sniffed below). */
|
||||
(#267 guard), a declared disagreement is CONTESTED. Sentinel and verdict
|
||||
ride the cache, so updates stay consistent. */
|
||||
typedef enum wire_verdict {
|
||||
WIRE_KEEPS_EXT,
|
||||
WIRE_WINS,
|
||||
@@ -143,105 +165,8 @@ static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
|
||||
return WIRE_CONTESTED;
|
||||
}
|
||||
|
||||
/* Optional evidence for a contested wire-vs-ext verdict. */
|
||||
typedef struct sniff_src {
|
||||
struct_back *sback; /* live backing (looked up by adr/fil) */
|
||||
const lien_back *headers; /* snapshot: r.adr, else the url_sav file */
|
||||
const char *adr, *fil;
|
||||
const char *prev_save; /* previous run's save name (cache X-Save) */
|
||||
} sniff_src;
|
||||
|
||||
#if HTS_USEZLIB
/* Decompress the beginning of a gzip or zlib stream.

   in/in_len   : compressed bytes available
   out/out_len : destination buffer

   Returns the number of bytes written to out, or 0 when the stream cannot be
   decoded. A truncated input is fine: whatever could be inflated is kept. */
static size_t sniff_inflate_head(const void *in, size_t in_len, void *out,
                                 size_t out_len) {
  z_stream strm;
  size_t produced;
  int rc;

  memset(&strm, 0, sizeof(strm));
  /* 47: gzip or zlib, autodetected */
  if (inflateInit2(&strm, 47) != Z_OK)
    return 0;

  strm.next_in = (const Bytef *) in;
  strm.avail_in = (uInt) in_len;
  strm.next_out = (Bytef *) out;
  strm.avail_out = (uInt) out_len;

  rc = inflate(&strm, Z_SYNC_FLUSH);
  switch (rc) {
  case Z_OK:
  case Z_STREAM_END:
  case Z_BUF_ERROR:
    /* progress made, stream finished, or a buffer ran out: keep the output */
    produced = out_len - strm.avail_out;
    break;
  default:
    produced = 0;
    break;
  }
  inflateEnd(&strm);
  return produced;
}
#endif
|
||||
|
||||
static size_t sniff_read_head(const char *path, void *buf, size_t len) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), path), "rb");
|
||||
size_t n = 0;
|
||||
|
||||
if (fp != NULL) {
|
||||
n = fread(buf, 1, len, fp);
|
||||
fclose(fp);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Body head of one slot: memory, else its flushed on-disk file (url_sav, or
|
||||
tmpfile for a compressed stream); inflated so the sniff sees the final body.
|
||||
*/
|
||||
static size_t sniff_slot_head(const lien_back *slot, void *buf, size_t len) {
|
||||
const htsblk *const r = &slot->r;
|
||||
size_t n = 0;
|
||||
|
||||
if (r->adr != NULL && r->size > 0) {
|
||||
n = (size_t) r->size < len ? (size_t) r->size : len;
|
||||
memcpy(buf, r->adr, n);
|
||||
} else {
|
||||
if (r->out != NULL)
|
||||
fflush(r->out);
|
||||
if (slot->url_sav[0] != '\0')
|
||||
n = sniff_read_head(slot->url_sav, buf, len);
|
||||
if (n == 0 && slot->tmpfile != NULL && slot->tmpfile[0] != '\0')
|
||||
n = sniff_read_head(slot->tmpfile, buf, len);
|
||||
}
|
||||
if (n > 0 && r->compressed) {
|
||||
#if HTS_USEZLIB
|
||||
unsigned char raw[HTS_SNIFF_LEN];
|
||||
|
||||
if (n > sizeof(raw))
|
||||
n = sizeof(raw);
|
||||
memcpy(raw, buf, n);
|
||||
n = sniff_inflate_head(raw, n, buf, len);
|
||||
#else
|
||||
n = 0;
|
||||
#endif
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Up to len leading body bytes; 0 when unavailable, and always in
|
||||
non-delayed mode (its HEAD-probe first run couldn't sniff either). */
|
||||
static size_t sniff_body_head(httrackp *opt, const sniff_src *src, void *buf,
|
||||
size_t len) {
|
||||
size_t n = 0;
|
||||
|
||||
if (src == NULL || opt->savename_delayed == HTS_SAVENAME_DELAYED_NONE)
|
||||
return 0;
|
||||
/* live backing slot: a snapshot (back_copy_static) loses r.adr/r.out */
|
||||
if (src->sback != NULL && src->adr != NULL && src->fil != NULL) {
|
||||
const int b = back_index(opt, src->sback, src->adr, src->fil, NULL);
|
||||
|
||||
if (b >= 0)
|
||||
n = sniff_slot_head(&src->sback->lnk[b], buf, len);
|
||||
}
|
||||
if (n == 0 && src->headers != NULL)
|
||||
n = sniff_slot_head(src->headers, buf, len);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Contested verdicts: magic proving the URL ext keeps it, else wire wins. */
|
||||
static int wire_patches_ext(httrackp *opt, const sniff_src *src,
|
||||
const char *wiremime, const char *file) {
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
|
||||
@@ -250,51 +175,22 @@ static int wire_patches_ext(httrackp *opt, const sniff_src *src,
|
||||
case WIRE_WINS:
|
||||
return 1;
|
||||
case WIRE_CONTESTED:
|
||||
break;
|
||||
}
|
||||
if (src != NULL) {
|
||||
if (hts_sniff_mime_known(urlmime)) {
|
||||
unsigned char head[HTS_SNIFF_LEN];
|
||||
const size_t n = sniff_body_head(opt, src, head, sizeof(head));
|
||||
|
||||
if (n > 0)
|
||||
return hts_sniff_mime_consistent(head, n, urlmime) ? 0 : 1;
|
||||
}
|
||||
/* no bytes: reproduce the previous run's verdict (cached X-Save name) */
|
||||
if (src->prev_save != NULL && src->prev_save[0] != '\0') {
|
||||
char prevmime[256];
|
||||
|
||||
prevmime[0] = '\0';
|
||||
if (get_httptype_sized(opt, prevmime, sizeof(prevmime), src->prev_save,
|
||||
0) &&
|
||||
strfield2(prevmime, urlmime))
|
||||
return 0;
|
||||
}
|
||||
break; /* no content evidence is consulted today: trust the wire */
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Tell whether sniffing the body could settle the wire-vs-extension question
   for this file: the wire type must be non-empty, the verdict must be
   WIRE_CONTESTED, and the type derived from the URL must be one the sniffer
   knows. Returns 1 if so, 0 otherwise. */
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime,
                         const char *file) {
  char urlmime[256];

  if (wiremime == NULL || !strnotempty(wiremime))
    return 0;
  if (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime)) !=
      WIRE_CONTESTED)
    return 0;
  return hts_sniff_mime_known(urlmime) ? 1 : 0;
}
|
||||
|
||||
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
|
||||
else the declared type's ext when wire_patches_ext() allows (returns 1),
|
||||
else 0. ext receives the new extension or replacement filename. */
|
||||
static int resolve_extension(httrackp *opt, const sniff_src *src,
|
||||
const char *cdispo, const char *contenttype,
|
||||
const char *fil, char *ext, size_t ext_size) {
|
||||
static int resolve_extension(httrackp *opt, const char *cdispo,
|
||||
const char *contenttype, const char *fil,
|
||||
char *ext, size_t ext_size) {
|
||||
if (strnotempty(cdispo)) {
|
||||
strlcpybuff(ext, cdispo, ext_size);
|
||||
return 2;
|
||||
}
|
||||
if (wire_patches_ext(opt, src, contenttype, fil) &&
|
||||
if (wire_patches_ext(opt, contenttype, fil) &&
|
||||
give_mimext(ext, ext_size, contenttype))
|
||||
return 1;
|
||||
return 0;
|
||||
@@ -546,21 +442,14 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
|
||||
// lire dans le cache
|
||||
char BIGSTK previous_save[HTS_URLMAXSIZE * 2];
|
||||
htsblk r;
|
||||
|
||||
previous_save[0] = '\0';
|
||||
r = cache_read_including_broken(opt, cache, adr, fil,
|
||||
previous_save); // test uniquement
|
||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||
|
||||
if (r.statuscode != -1) { // cache entry read OK
|
||||
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
|
||||
adr_complete, fil_complete);
|
||||
if (!HTTP_IS_REDIRECT(r.statuscode)) {
|
||||
const sniff_src src = {sback, NULL, adr, fil, previous_save};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, r.cdispo, r.contenttype,
|
||||
fil, ext, sizeof(ext));
|
||||
ext_chg = resolve_extension(opt, r.cdispo, r.contenttype, fil,
|
||||
ext, sizeof(ext));
|
||||
}
|
||||
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
@@ -587,9 +476,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
!opt->state.stop) {
|
||||
// Check if the file is ready in backing.
|
||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||
const sniff_src src = {sback, headers, adr, fil, NULL};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, headers->r.cdispo,
|
||||
ext_chg = resolve_extension(opt, headers->r.cdispo,
|
||||
headers->r.contenttype,
|
||||
headers->url_fil, ext, sizeof(ext));
|
||||
}
|
||||
@@ -627,10 +514,11 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
int has_been_moved = 0;
|
||||
lien_adrfil current;
|
||||
|
||||
/* Wait for an available test slot, honoring the connection limits
|
||||
/* Ensure we don't use too many sockets by using a "testing" one
|
||||
If we have only 1 simultaneous connection authorized, wait for pending download
|
||||
Wait for an available slot
|
||||
*/
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
|
||||
/* Rock'in */
|
||||
current.adr[0] = current.fil[0] = '\0';
|
||||
@@ -660,11 +548,24 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart),
|
||||
&HTS_STAT)) {
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
stop_looping = 1;
|
||||
}
|
||||
@@ -729,9 +630,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
"Loop with HEAD request (during prefetch) at %s%s",
|
||||
current.adr, current.fil);
|
||||
}
|
||||
if (!hts_wait_available_socket(sback, opt,
|
||||
cache, ptr))
|
||||
return -1;
|
||||
// Ajouter
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
if (back_add(sback, opt, cache, moved.adr, moved.fil, methode, referer_adr, referer_fil, 1) != -1) { // OK
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"(during prefetch) %s (%d) to link %s at %s%s",
|
||||
@@ -787,7 +687,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
|
||||
// no error: change the type?
|
||||
ext_chg = resolve_extension(
|
||||
opt, NULL, back[b].r.cdispo, back[b].r.contenttype,
|
||||
opt, back[b].r.cdispo, back[b].r.contenttype,
|
||||
back[b].url_fil, ext, sizeof(ext));
|
||||
}
|
||||
// FIN Si non déplacé, forcer type?
|
||||
|
||||
@@ -100,8 +100,6 @@ void standard_name(char *b, size_t bsize, const char *dot_pos,
|
||||
const char *nom_pos, const char *fil_complete,
|
||||
int short_ver);
|
||||
void url_savename_addstr(char *d, const char *s);
|
||||
/* Contested wire-vs-ext verdict that a body sniff could settle (htssniff.h). */
|
||||
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime, const char *file);
|
||||
char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
|
||||
131
src/htsparse.c
131
src/htsparse.c
@@ -49,7 +49,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsindex.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
|
||||
/* external modules */
|
||||
#include "htsmodules.h"
|
||||
@@ -3399,7 +3398,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
|
||||
if (!hts_loop_tick(sback, opt, 0, ptr)) {
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, 0, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -3410,6 +3422,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
nofollow = 1; // moins violent
|
||||
opt->state._hts_cancel = 0;
|
||||
}
|
||||
|
||||
}
|
||||
// refresh the backing system each 2 seconds
|
||||
if (engine_stats()) {
|
||||
@@ -3946,8 +3959,22 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
{
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
b = 0;
|
||||
if (!hts_loop_tick(sback, opt, b, ptr) || !back_checkmirror(opt)) {
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
|| !back_checkmirror(opt)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4049,11 +4076,21 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) { // on fait la pause..
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4240,12 +4277,26 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
||||
freet(s);
|
||||
}
|
||||
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if HTS_POLL
|
||||
@@ -4478,9 +4529,10 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
IS_DELAYED_EXT(afs->save) && continue_loop && loops < 7; loops++) {
|
||||
continue_loop = 0;
|
||||
|
||||
/* Wait for an available slot */
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
/*
|
||||
Wait for an available slot
|
||||
*/
|
||||
WAIT_FOR_AVAILABLE_SOCKET();
|
||||
|
||||
/* We can lookup directly in the cache to speedup this mess */
|
||||
if (opt->delayed_cached) {
|
||||
@@ -4626,28 +4678,39 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
back_set_unlocked(sback, b);
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
{
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101) ||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Contested type: wait for a sniffable body head (or EOF) */
|
||||
(back[b].r.statuscode == HTTP_OK && back[b].status > 0 &&
|
||||
strnotempty(back[b].r.cdispo) == 0 &&
|
||||
back[b].r.size < HTS_SNIFF_LEN &&
|
||||
hts_ext_sniff_wanted(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)));
|
||||
} while(
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101)
|
||||
||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0)
|
||||
||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0)
|
||||
);
|
||||
if (b >= 0) {
|
||||
back_set_unlocked(sback, b); // Unlocked entry
|
||||
}
|
||||
@@ -4782,8 +4845,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* patch url_sav BEFORE finalize: it records/caches under this name
|
||||
*/
|
||||
/* Patch destination filename for direct-to-disk mode, BEFORE any
|
||||
finalize: it records and caches the entry under url_sav */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
/* Finalize now as we have the type */
|
||||
if (back[b].status == STATUS_READY) {
|
||||
|
||||
@@ -175,4 +175,27 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
/* Apply changes */ \
|
||||
* str->ptr_ = ptr
|
||||
|
||||
#define WAIT_FOR_AVAILABLE_SOCKET() do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback,opt,cache,0); \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket=back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
|
||||
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
|
||||
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
|
||||
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
|
||||
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
|
||||
/* Check */ \
|
||||
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,7 +52,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsencoding.h"
|
||||
#include "htsftp.h"
|
||||
#include "htsmd5.h"
|
||||
#include "htssniff.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
@@ -1142,22 +1141,6 @@ static size_t st_decode_body(const char *arg, char *buf, size_t size) {
|
||||
return n;
|
||||
}
|
||||
|
||||
static int st_sniff(httrackp *opt, int argc, char **argv) {
|
||||
char BIGSTK body[1024];
|
||||
size_t n;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "sniff: needs a content-type and a body\n");
|
||||
return 1;
|
||||
}
|
||||
n = st_decode_body(argv[1], body, sizeof(body));
|
||||
printf("sniff: known=%d consistent=%d\n",
|
||||
hts_sniff_mime_known(argv[0]) == HTS_TRUE,
|
||||
hts_sniff_mime_consistent(body, n, argv[0]) == HTS_TRUE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
lien_adrfilsave afs;
|
||||
cache_back cache;
|
||||
@@ -1228,7 +1211,7 @@ static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
}
|
||||
*sep = '\0';
|
||||
/* one-entry cache in cwd, reopened read-only; body is PNG magic on
|
||||
purpose: only the recorded name (X-Save) may drive the naming */
|
||||
purpose: naming must not depend on stored content */
|
||||
StringCopy(opt->path_log, "");
|
||||
cache.type = 1;
|
||||
cache.log = cache.errlog = stderr;
|
||||
@@ -1289,7 +1272,7 @@ static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
if (cdispo != NULL)
|
||||
strcpybuff(headers.r.cdispo, cdispo);
|
||||
strcpybuff(headers.url_fil, argv[0]);
|
||||
if (body != NULL) { /* leading body bytes, read via url_sav */
|
||||
if (body != NULL) { /* leading body bytes, exposed via url_sav */
|
||||
char BIGSTK data[1024];
|
||||
const size_t n = st_decode_body(body, data, sizeof(data));
|
||||
FILE *const fp = fopen(bodyfile, "wb");
|
||||
@@ -1347,30 +1330,6 @@ static int st_cache_writefail(httrackp *opt, int argc, char **argv) {
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_cache_corrupt(httrackp *opt, int argc, char **argv) {
|
||||
int err;
|
||||
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "cache-corrupt: needs a directory\n");
|
||||
return 1;
|
||||
}
|
||||
err = cache_corruption_selftest(opt, argv[0]);
|
||||
printf("cache-corrupt: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_reconcile(httrackp *opt, int argc, char **argv) {
|
||||
int err;
|
||||
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "reconcile: needs a directory\n");
|
||||
return 1;
|
||||
}
|
||||
err = cache_reconcile_selftest(opt, argv[0]);
|
||||
printf("cache-reconcile: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_dns(httrackp *opt, int argc, char **argv) {
|
||||
const int err = dns_selftests(opt);
|
||||
|
||||
@@ -2136,17 +2095,11 @@ static const struct selftest_entry {
|
||||
st_header},
|
||||
{"savename", "<fil> <content-type> [key=value ...]",
|
||||
"local save-name for a URL", st_savename},
|
||||
{"sniff", "<content-type> <hex:..|text>", "MIME magic consistency",
|
||||
st_sniff},
|
||||
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
|
||||
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
|
||||
st_cache_golden},
|
||||
{"cache-writefail", "<dir>", "cache write-failure handling self-test",
|
||||
st_cache_writefail},
|
||||
{"reconcile", "<dir>", "cache generation reconcile policy self-test",
|
||||
st_reconcile},
|
||||
{"cache-corrupt", "<dir>", "cache read-side corruption self-test",
|
||||
st_cache_corrupt},
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
|
||||
352
src/htssniff.c
352
src/htssniff.c
@@ -1,352 +0,0 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include "htssniff.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "htslib.h"
|
||||
|
||||
/* One magic rule: `len` bytes at `off` confirm `mime`. */
|
||||
typedef struct sniff_magic {
|
||||
const char *mime;
|
||||
unsigned short off;
|
||||
unsigned char len;
|
||||
const char *bytes;
|
||||
} sniff_magic;
|
||||
|
||||
/* Direction is mime -> magic (verify a claim, never classify); types with
|
||||
no reliable magic (plain text, css, js..) are deliberately absent. Patterns
|
||||
follow the WHATWG MIME Sniffing Standard tables where it defines them
|
||||
(https://mimesniff.spec.whatwg.org/); the rest covers httrack's wider MIME
|
||||
set. Spec-only types absent from our MIME tables (EOT, font/collection)
|
||||
are omitted as unreachable. */
|
||||
static const sniff_magic sniff_table[] = {
|
||||
/* images */
|
||||
{"image/jpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pipeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pjpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/png", 0, 8, "\x89PNG\r\n\x1a\n"},
|
||||
{"image/gif", 0, 6, "GIF87a"},
|
||||
{"image/gif", 0, 6, "GIF89a"},
|
||||
{"image/bmp", 0, 2, "BM"},
|
||||
{"image/tiff", 0, 4, "II*\0"},
|
||||
{"image/tiff", 0, 4, "MM\0*"},
|
||||
{"image/x-icon", 0, 4, "\0\0\1\0"},
|
||||
{"image/x-icon", 0, 4, "\0\0\2\0"}, /* Windows cursor, per the spec */
|
||||
{"image/x-portable-bitmap", 0, 2, "P1"},
|
||||
{"image/x-portable-bitmap", 0, 2, "P4"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P3"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P6"},
|
||||
{"image/x-xpixmap", 0, 9, "/* XPM */"},
|
||||
{"image/x-xbitmap", 0, 7, "#define"},
|
||||
{"image/x-rgb", 0, 2, "\x01\xda"},
|
||||
{"image/x-cmu-raster", 0, 4, "\xf1\x00\x40\xbb"},
|
||||
/* audio */
|
||||
{"audio/mpeg", 0, 3, "ID3"},
|
||||
{"audio/basic", 0, 4, ".snd"},
|
||||
{"audio/mid", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/midi", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".RMF"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".RMF"},
|
||||
{"audio/flac", 0, 4, "fLaC"},
|
||||
{"audio/aac", 0, 4, "ADIF"},
|
||||
/* video */
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xba"},
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xb3"},
|
||||
{"video/x-sgi-movie", 0, 4, "MOVI"},
|
||||
/* archives / compression */
|
||||
{"application/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"multipart/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compressed", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compress", 0, 2, "\x1f\x9d"},
|
||||
{"application/x-bzip2", 0, 3, "BZh"},
|
||||
{"application/x-7z-compressed", 0, 6, "7z\xbc\xaf\x27\x1c"},
|
||||
/* 6-byte prefix common to RAR4 (spec) and RAR5 */
|
||||
{"application/x-rar-compressed", 0, 6, "Rar!\x1a\x07"},
|
||||
{"application/zstd", 0, 4, "\x28\xb5\x2f\xfd"},
|
||||
{"application/arj", 0, 2, "\x60\xea"},
|
||||
{"application/x-cpio", 0, 6, "070701"},
|
||||
{"application/x-cpio", 0, 6, "070707"},
|
||||
{"application/x-cpio", 0, 2, "\xc7\x71"},
|
||||
{"application/x-sv4cpio", 0, 6, "070701"},
|
||||
{"application/x-sv4crc", 0, 6, "070702"},
|
||||
{"application/x-stuffit", 0, 8, "StuffIt "},
|
||||
{"application/x-stuffit", 0, 4, "SIT!"},
|
||||
{"application/mac-binhex40", 0, 10, "(This file"},
|
||||
/* documents */
|
||||
{"application/pdf", 0, 5, "%PDF-"},
|
||||
{"application/postscript", 0, 2, "%!"},
|
||||
{"application/rtf", 0, 5, "{\\rtf"},
|
||||
{"application/x-dvi", 0, 2, "\xf7\x02"},
|
||||
{"application/x-hdf", 0, 4, "\x0e\x03\x13\x01"},
|
||||
{"application/x-hdf", 0, 8, "\x89HDF\r\n\x1a\n"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x01"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x02"},
|
||||
{"application/x-msaccess", 0, 19, "\0\1\0\0Standard Jet DB"},
|
||||
/* fonts */
|
||||
{"font/woff", 0, 4, "wOFF"},
|
||||
{"font/woff2", 0, 4, "wOF2"},
|
||||
{"font/ttf", 0, 4, "\0\1\0\0"},
|
||||
{"font/ttf", 0, 4, "true"},
|
||||
{"font/otf", 0, 4, "OTTO"},
|
||||
/* misc */
|
||||
{"application/x-shockwave-flash", 0, 3, "FWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "CWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "ZWS"},
|
||||
{"application/futuresplash", 0, 3, "FWS"},
|
||||
{"application/x-director", 0, 4, "RIFX"},
|
||||
{"application/x-director", 0, 4, "XFIR"},
|
||||
{"application/x-java-vm", 0, 4, "\xca\xfe\xba\xbe"},
|
||||
{"application/wasm", 0, 4, "\0asm"},
|
||||
{"application/x-msmetafile", 0, 4, "\xd7\xcd\xc6\x9a"},
|
||||
{"application/x-msmetafile", 0, 4, "\x01\x00\x09\x00"},
|
||||
{"application/x-x509-ca-cert", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs12", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-mime", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-signature", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-certificates", 0, 2, "\x30\x82"},
|
||||
{"x-world/x-vrml", 0, 5, "#VRML"},
|
||||
{"application/x-bittorrent", 0, 11, "d8:announce"},
|
||||
{"drawing/x-dwf", 0, 4, "(DWF"},
|
||||
{"application/acad", 0, 4, "AC10"},
|
||||
{NULL, 0, 0, NULL}};
|
||||
|
||||
/* MIME families sharing a container magic */
|
||||
static const char *const zip_mimes[] = {
|
||||
"application/zip", "application/x-zip-compressed", "multipart/x-zip", NULL};
|
||||
static const char *const zip_mime_prefixes[] = {
|
||||
"application/vnd.openxmlformats-officedocument.",
|
||||
"application/vnd.oasis.opendocument.", NULL};
|
||||
static const char *const ole_mimes[] = {"application/msword",
|
||||
"application/excel",
|
||||
"application/vnd.ms-excel",
|
||||
"application/powerpoint",
|
||||
"application/vnd.ms-powerpoint",
|
||||
"application/vnd.ms-project",
|
||||
"application/vnd.ms-works",
|
||||
"application/x-msmoney",
|
||||
"application/x-mspublisher",
|
||||
NULL};
|
||||
static const char *const tar_mimes[] = {
|
||||
"application/x-tar", "application/x-ustar", "application/x-gtar", NULL};
|
||||
static const char *const ogg_mimes[] = {"application/ogg", "audio/ogg",
|
||||
"video/ogg", "audio/opus", NULL};
|
||||
static const char *const ebml_mimes[] = {"video/webm", "audio/webm", NULL};
|
||||
/* ISO-BMFF, any 'ftyp' brand: containers overlap too much to split */
|
||||
static const char *const bmff_mimes[] = {"video/mp4", "audio/mp4",
|
||||
"video/quicktime", NULL};
|
||||
static const char *const avif_mimes[] = {"image/avif", NULL};
|
||||
static const char *const heic_mimes[] = {"image/heic", NULL};
|
||||
static const char *const asf_mimes[] = {"video/x-ms-asf", "video/x-ms-wmv",
|
||||
"video/x-la-asf", NULL};
|
||||
static const char *const xml_mimes[] = {"application/xml", "text/xml",
|
||||
"image/svg+xml", "image/svg-xml", NULL};
|
||||
static const char *const svg_mimes[] = {"image/svg+xml", "image/svg-xml", NULL};
|
||||
static const char *const html_mimes[] = {"text/html", NULL};
|
||||
static const char *const pem_mimes[] = {
|
||||
"application/x-x509-ca-cert", "application/x-pkcs7-certificates",
|
||||
"application/x-pkcs7-mime", "application/x-pkcs7-signature", NULL};
|
||||
|
||||
static hts_boolean mime_in(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield2(list[i], mime))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean mime_in_prefix(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield(mime, list[i]))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean has_bytes(const unsigned char *d, size_t n, size_t off,
|
||||
const char *bytes, size_t len) {
|
||||
/* overflow-safe: untrusted n alone on one side */
|
||||
return n >= off && len <= n - off && memcmp(d + off, bytes, len) == 0
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
static unsigned char ascii_lower(unsigned char c) {
|
||||
return c >= 'A' && c <= 'Z' ? (unsigned char) (c + 32) : c;
|
||||
}
|
||||
|
||||
/* Case-insensitive text prefix after an optional UTF-8 BOM and whitespace. */
|
||||
static hts_boolean has_text_prefix(const unsigned char *d, size_t n,
|
||||
const char *prefix) {
|
||||
const size_t len = strlen(prefix);
|
||||
size_t i, k;
|
||||
|
||||
i = n >= 3 && memcmp(d, "\xef\xbb\xbf", 3) == 0 ? 3 : 0;
|
||||
while (i < n && (d[i] == ' ' || d[i] == '\t' || d[i] == '\r' || d[i] == '\n'))
|
||||
i++;
|
||||
if (len > n - i) /* i <= n from the loop above */
|
||||
return HTS_FALSE;
|
||||
for (k = 0; k < len; k++)
|
||||
if (ascii_lower(d[i + k]) != ascii_lower((unsigned char) prefix[k]))
|
||||
return HTS_FALSE;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
typedef enum sniff_op {
|
||||
SNIFF_QUERY_KNOWN, /* is any rule defined for this MIME? */
|
||||
SNIFF_QUERY_MATCH /* do the bytes confirm this MIME? */
|
||||
} sniff_op;
|
||||
|
||||
/* Single walk for both queries so the rule set can't drift apart. */
|
||||
static hts_boolean sniff_eval(sniff_op op, const unsigned char *d, size_t n,
|
||||
const char *mime) {
|
||||
size_t i;
|
||||
|
||||
/* KNOWN short-circuits; MATCH tests the magic */
|
||||
#define SNIFF_RULE(cond) \
|
||||
do { \
|
||||
if (op == SNIFF_QUERY_KNOWN) \
|
||||
return HTS_TRUE; \
|
||||
if (cond) \
|
||||
return HTS_TRUE; \
|
||||
} while (0)
|
||||
|
||||
for (i = 0; sniff_table[i].mime != NULL; i++) {
|
||||
if (strfield2(sniff_table[i].mime, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, sniff_table[i].off, sniff_table[i].bytes,
|
||||
sniff_table[i].len));
|
||||
}
|
||||
}
|
||||
if (mime_in(zip_mimes, mime) || mime_in_prefix(zip_mime_prefixes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "PK\3\4", 4) ||
|
||||
has_bytes(d, n, 0, "PK\5\6", 4));
|
||||
}
|
||||
if (mime_in(ole_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8));
|
||||
}
|
||||
if (mime_in(tar_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 257, "ustar", 5));
|
||||
}
|
||||
if (mime_in(ogg_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "OggS\0", 5));
|
||||
}
|
||||
if (mime_in(ebml_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x1a\x45\xdf\xa3", 4));
|
||||
}
|
||||
if (mime_in(bmff_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftyp", 4));
|
||||
}
|
||||
if (mime_in(avif_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftypavif", 8) ||
|
||||
has_bytes(d, n, 4, "ftypavis", 8));
|
||||
}
|
||||
if (mime_in(heic_mimes, mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 4, "ftyphei", 7) || has_bytes(d, n, 4, "ftyphev", 7) ||
|
||||
has_bytes(d, n, 4, "ftypmif1", 8) || has_bytes(d, n, 4, "ftypmsf1", 8));
|
||||
}
|
||||
if (mime_in(asf_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x30\x26\xb2\x75\x8e\x66\xcf\x11", 8));
|
||||
}
|
||||
if (strfield2("audio/x-wav", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "WAVE", 4));
|
||||
}
|
||||
if (strfield2("video/x-msvideo", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "AVI ", 4));
|
||||
}
|
||||
if (strfield2("image/webp", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) &&
|
||||
has_bytes(d, n, 8, "WEBPVP", 6));
|
||||
}
|
||||
if (strfield2("image/x-portable-anymap", mime)) {
|
||||
SNIFF_RULE(n >= 2 && d[0] == 'P' && d[1] >= '1' && d[1] <= '6');
|
||||
}
|
||||
if (strfield2("audio/x-aiff", mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 0, "FORM", 4) &&
|
||||
(has_bytes(d, n, 8, "AIFF", 4) || has_bytes(d, n, 8, "AIFC", 4)));
|
||||
}
|
||||
if (strfield2("audio/mpeg", mime)) {
|
||||
/* MPEG audio frame sync (11 bits), valid layer and bitrate fields */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xe0) == 0xe0 &&
|
||||
(d[1] & 0x06) != 0);
|
||||
}
|
||||
if (strfield2("audio/aac", mime)) {
|
||||
/* ADTS sync */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xf6) == 0xf0);
|
||||
}
|
||||
if (strfield2("video/mp2t", mime)) {
|
||||
SNIFF_RULE(n >= 1 && d[0] == 0x47 && (n <= 188 || d[188] == 0x47));
|
||||
}
|
||||
if (mime_in(xml_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<?xml"));
|
||||
}
|
||||
if (mime_in(svg_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<svg") ||
|
||||
has_text_prefix(d, n, "<!DOCTYPE svg"));
|
||||
}
|
||||
if (mime_in(html_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<!DOCTYPE") ||
|
||||
has_text_prefix(d, n, "<html") ||
|
||||
has_text_prefix(d, n, "<head"));
|
||||
}
|
||||
if (mime_in(pem_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "-----BEGIN"));
|
||||
}
|
||||
if (strfield2("audio/x-mpegurl", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "#EXTM3U"));
|
||||
}
|
||||
if (strfield2("text/x-vcard", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "BEGIN:VCARD"));
|
||||
}
|
||||
#undef SNIFF_RULE
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_known(const char *mime) {
|
||||
if (mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_KNOWN, NULL, 0, mime);
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime) {
|
||||
if (data == NULL || size == 0 || mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_MATCH, (const unsigned char *) data, size,
|
||||
mime);
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSSNIFF_DEFH
|
||||
#define HTSSNIFF_DEFH
|
||||
|
||||
#include <stddef.h>
|
||||
#include "htsglobal.h"
|
||||
|
||||
/* Leading-body window read to arbitrate a wire/extension MIME conflict. */
|
||||
#define HTS_SNIFF_LEN 512
|
||||
|
||||
/* Can a magic rule ever confirm this MIME? (whether sniffing is worth it) */
|
||||
hts_boolean hts_sniff_mime_known(const char *mime);
|
||||
|
||||
/* TRUE when the leading body bytes are consistent with the claimed MIME;
|
||||
FALSE on unknown MIME, unknown magic, or too-short data (fail-safe). */
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime);
|
||||
|
||||
#endif
|
||||
@@ -1,17 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Cache generation reconcile policies (httrack -#test=reconcile <dir>):
|
||||
# promote a stranded old generation, keep the larger one after an aborted
|
||||
# run, and restore the old one when an update transferred nothing.
|
||||
|
||||
set -eu
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
out=$(httrack -#test=reconcile "$dir")
|
||||
|
||||
test "$out" = "cache-reconcile: OK" || {
|
||||
echo "expected 'cache-reconcile: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -81,14 +81,15 @@ name '/x.pdf' 'text/html' 'x.html' status=-1
|
||||
name '/x.html' 'text/html' 'x.html' status=-1
|
||||
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
|
||||
|
||||
# Contested type (wire disagrees with a specific ext): magic bytes proving the
|
||||
# extension right keep it, anything else trusts the wire as before.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' body=hex:FFD8FFE000104A46
|
||||
# Contested type (wire disagrees with a specific ext): the wire is trusted and
|
||||
# body bytes are not consulted; pinned so a content-based tie-break shows up
|
||||
# as an explicit flip of these rows.
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:FFD8FFE000104A46
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
|
||||
name '/photo.jpg' 'image/png' 'photo.png'
|
||||
name '/doc.pdf' 'text/html' 'doc.pdf' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }' # no rule for css: wire wins
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }'
|
||||
|
||||
# A redirect answer resolves nothing: delayed placeholder name.
|
||||
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# MIME magic consistency (-#test=sniff <content-type> <hex:..|text>), the
|
||||
# tie-break behind htsname's wire-vs-extension naming.
|
||||
|
||||
chk() {
|
||||
local mime="$1" body="$2" want="$3"
|
||||
out="$(httrack -#test=sniff "$mime" "$body" | sed -n 's/^sniff: //p')"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: '$mime' '$body' -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
yes='known=1 consistent=1'
|
||||
no='known=1 consistent=0'
|
||||
unk='known=0 consistent=0'
|
||||
|
||||
# images
|
||||
chk image/jpeg hex:FFD8FFE000104A46 "$yes"
|
||||
chk image/png hex:89504E470D0A1A0A "$yes"
|
||||
chk image/png hex:FFD8FFE000104A46 "$no" # jpeg bytes are not a png
|
||||
chk image/gif 'GIF89a' "$yes"
|
||||
chk image/bmp 'BMxxxx' "$yes"
|
||||
chk image/tiff hex:49492A00 "$yes"
|
||||
chk image/tiff hex:4D4D002A "$yes" # both endians
|
||||
chk image/x-icon hex:00000100 "$yes"
|
||||
chk image/x-icon hex:00000200 "$yes" # Windows cursor, spec maps to x-icon
|
||||
chk image/webp 'RIFFxxxxWEBPVP' "$yes"
|
||||
chk image/webp 'RIFFxxxxWAVE' "$no" # riff subtype discriminates
|
||||
chk image/avif hex:0000001C6674797061766966 "$yes"
|
||||
chk image/avif hex:0000001C6674797068656963 "$no" # heic brand is not avif
|
||||
chk image/heic hex:0000001C6674797068656963 "$yes"
|
||||
chk image/svg+xml '<svg xmlns="x">' "$yes"
|
||||
chk image/svg+xml $'\xef\xbb\xbf <?xml version="1.0"?>' "$yes" # BOM+ws skip
|
||||
|
||||
# audio / video
|
||||
chk audio/mpeg 'ID3xxx' "$yes"
|
||||
chk audio/mpeg hex:FFFB9000 "$yes" # bare frame sync
|
||||
chk audio/aac hex:FFF15080 "$yes"
|
||||
chk audio/flac 'fLaC' "$yes"
|
||||
chk audio/ogg hex:4F67675300 "$yes"
|
||||
chk audio/x-wav 'RIFFxxxxWAVE' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxAVI ' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxWAVE' "$no"
|
||||
chk video/mp4 hex:000000186674797069736F6D "$yes"
|
||||
chk video/webm hex:1A45DFA3 "$yes"
|
||||
chk video/mpeg hex:000001BA "$yes"
|
||||
chk video/x-ms-wmv hex:3026B2758E66CF11 "$yes"
|
||||
|
||||
# archives; zip magic covers the office-container families
|
||||
chk application/zip hex:504B0304 "$yes"
|
||||
chk application/vnd.openxmlformats-officedocument.wordprocessingml.document hex:504B0304 "$yes"
|
||||
chk application/vnd.oasis.opendocument.text hex:504B0304 "$yes"
|
||||
chk application/msword hex:D0CF11E0A1B11AE1 "$yes"
|
||||
chk application/msword hex:504B0304 "$no" # legacy .doc is OLE, not zip
|
||||
chk application/x-gzip hex:1F8B08 "$yes"
|
||||
chk application/x-bzip2 'BZh9' "$yes"
|
||||
chk application/x-7z-compressed hex:377ABCAF271C "$yes"
|
||||
chk application/x-rar-compressed hex:526172211A07 "$yes"
|
||||
chk application/zstd hex:28B52FFD "$yes"
|
||||
chk application/x-tar "hex:$(printf '00%.0s' {1..257})7573746172" "$yes" # ustar at 257
|
||||
chk application/x-tar hex:7573746172 "$no"
|
||||
|
||||
# documents, fonts, misc
|
||||
chk application/pdf '%PDF-1.7' "$yes"
|
||||
chk application/pdf '<html><body>soft 404</body></html>' "$no"
|
||||
chk application/postscript '%!PS-Adobe' "$yes"
|
||||
chk application/rtf '{\rtf1' "$yes"
|
||||
chk font/woff2 'wOF2' "$yes"
|
||||
chk font/otf 'OTTO' "$yes"
|
||||
chk font/ttf hex:0001000000 "$yes"
|
||||
chk application/x-shockwave-flash 'CWSx' "$yes"
|
||||
chk application/x-java-vm hex:CAFEBABE "$yes"
|
||||
chk application/wasm hex:0061736D "$yes"
|
||||
chk text/html $' \r\n<!DOCTYPE html><html>' "$yes"
|
||||
chk text/html '<html lang="en">' "$yes"
|
||||
chk text/html 'plain text, no markup' "$no"
|
||||
chk text/xml '<?xml version="1.0"?>' "$yes"
|
||||
|
||||
# no magic rule at all: never confirmed, never blocks the wire type
|
||||
chk text/css 'body { }' "$unk"
|
||||
chk text/plain 'hello' "$unk"
|
||||
chk application/x-javascript 'var x;' "$unk"
|
||||
@@ -1,19 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Read-side cache corruption (httrack -#test=cache-corrupt <dir>): zip byte
|
||||
# surgery (bad/oversized X-Size, blanked X-In-Cache, smashed header, garbled
|
||||
# deflate) must each be rejected per-entry, never crash, never taint the sibling.
|
||||
|
||||
set -eu
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
# the smashed-header case logs expected "Corrupted cache entry" warnings on
|
||||
# stdout; the verdict is the last line
|
||||
out=$(httrack -#test=cache-corrupt "$dir" 2>/dev/null | tail -n1)
|
||||
|
||||
test "$out" = "cache-corrupt: OK" || {
|
||||
echo "expected 'cache-corrupt: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -23,11 +23,11 @@ name() {
|
||||
}
|
||||
}
|
||||
|
||||
# No live bytes: the recorded save name (X-Save) reproduces the previous
|
||||
# verdict; cached body bytes (PNG magic) are ignored; css has no magic rule.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' 'cached=image/png|www.example.com/photo.jpg'
|
||||
# Names are re-derived from the stored headers on every run: neither the
|
||||
# recorded save name nor the cached body bytes change the verdict (pinned).
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.jpg'
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
|
||||
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
|
||||
name '/style.css' 'image/png' 'style.css' 'cached=image/png|www.example.com/style.css'
|
||||
name '/style.css' 'image/png' 'style.png' 'cached=image/png|www.example.com/style.css'
|
||||
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
|
||||
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Type vs URL-extension naming (#267 family, default -%N2). A MISSING
|
||||
# type keeps a specific non-HTML ext; a DECLARED disagreeing type is trusted
|
||||
# unless magic bytes prove the ext right (lie/wrongtype/packed keep theirs),
|
||||
# so a real HTML body (report.pdf) still becomes .html. Wrong names are
|
||||
# asserted absent so a regression in either direction fails.
|
||||
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||
# asserted absent so a regression in either direction fails here.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
@@ -13,11 +14,11 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/bigtype.png' --not-found 'types/bigtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An update pass keeps the names the first crawl chose: type and save name
|
||||
# ride the cache, so a declared-text/html .pdf stays .html, a typeless .png
|
||||
# stays .png, and a sniff-kept ext is reproduced from X-Save even when the
|
||||
# refetched content changed (mutant.jpg serves PNG bytes on the rerun).
|
||||
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||
# typeless .png stays .png across the update rather than reverting.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
--found 'types/lie.html' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -E time limit (#481): server pages trickle for minutes; the engine must stop
|
||||
# on its own at -E plus grace, aborting the in-flight transfers.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# cancelled crawls can orphan .delayed placeholders (#483): skip that audit
|
||||
start=$(date +%s)
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--skip-delayed-audit \
|
||||
--log-found 'More than 2 seconds passed' \
|
||||
httrack 'BASEURL/trickle/index.html' -E2 -c4
|
||||
wall=$(($(date +%s) - start))
|
||||
# hard stop is due at -E2 + 5s grace; near TRICKLE_SECONDS means it never fired
|
||||
if [ "$wall" -ge 30 ]; then
|
||||
echo "crawl took ${wall}s, -E hard stop did not engage" >&2
|
||||
exit 1
|
||||
fi
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -M byte cap (#77): the crawl must stop with the "giving up" error and keep
|
||||
# the mirror well under the 8 x 640KB the fixture totals uncapped.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# cap = -M + the 4 in-flight files the smooth stop lets finish + one of margin
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--log-found 'More than 400000 bytes have been transferred.. giving up' \
|
||||
--found bigfiles/p0.bin \
|
||||
--max-mirror-bytes 3700000 \
|
||||
httrack 'BASEURL/bigfiles/index.html' -M400000 -c4
|
||||
@@ -1,55 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Diverse seeded /big/ crawl: 12 pattern families, decoy absence, update pass
|
||||
# must 304-revalidate. 360 = 1 index + 96 pages + 192 imgs + 5 shared + 60
|
||||
# family + 6 singles; the 4 planted errors write -o1 pages, not counted.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --rerun \
|
||||
--errors 4 --files 360 \
|
||||
--found 'big/p/95.html' \
|
||||
--found 'big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png' \
|
||||
--found 'big/a/f2-2x.png' \
|
||||
--found 'big/a/subs.vtt' \
|
||||
--found 'big/a/font.woff2' \
|
||||
--found 'big/a/js-data.bin' \
|
||||
--found 'big/d/01.pdf' \
|
||||
--found 'big/d/named.pdf' \
|
||||
--found 'big/a/doc.pdf' \
|
||||
--found "big/f9/caf$(printf '\xc3\xa9').html" \
|
||||
--found 'big/f7/fa.html' \
|
||||
--found 'big/a/ref.png' \
|
||||
--found 'big/f6/sub/leaf.html' \
|
||||
--found 'big/f1/dir/index.html' \
|
||||
--found 'big/f10/empty.html' \
|
||||
--found 'big/indexd41d.html' \
|
||||
--found 'big/a/i0a.png' \
|
||||
--not-found 'big/x/og' \
|
||||
--not-found 'big/x/tw' \
|
||||
--not-found 'big/x/jsonld.png' \
|
||||
--not-found 'big/x/never-scanned.png' \
|
||||
--not-found 'big/x/atom-only.html' \
|
||||
--not-found 'big/x/sitemap-only.html' \
|
||||
--not-found 'big/x/form-target.html' \
|
||||
--not-found 'big/x/formact' \
|
||||
--not-found 'big/x/ping' \
|
||||
--not-found 'big/x/aj.jar' \
|
||||
--not-found 'big/x/bj.jar' \
|
||||
--not-found 'big/x/is1.png' \
|
||||
--not-found 'big/x/concat.html' \
|
||||
--file-matches 'big/p/2.html' 'srcset="\.\./a/f2-1x\.png 1x, \.\./a/f2-2x\.png 2x"' \
|
||||
--file-matches 'big/a/blk2.css' 'url\(blk2-bg\.png\)' \
|
||||
--file-matches 'big/p/5.html' "document\\.write\\('<a href=\"\\.\\./f5/dw\\.html\"" \
|
||||
--file-not-matches 'big/p/1.html' 'href="/big/' \
|
||||
--log-not-found 'bogus state|[Pp]anic|assert' \
|
||||
--log-found '\(404\) at link [^ ]*/big/e/404\.html' \
|
||||
--log-found '\(410\) at link [^ ]*/big/e/410\.html' \
|
||||
--log-found '\(500\) at link [^ ]*/big/e/500\.html' \
|
||||
--log-found 'decompressing.*big/e/gztrunc\.html' \
|
||||
--log-found ', no files updated' \
|
||||
--max-mirror-bytes 700000 \
|
||||
--min-mirror-bytes 500000 \
|
||||
httrack 'BASEURL/big/index.html' --retries=0 -c8 -%c100 -A100000000
|
||||
@@ -1,12 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An update run against a dead server must not destroy the cache: the no-data
|
||||
# rollback restores the previous hts-cache generation (zip caches lost it).
|
||||
|
||||
set -eu
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun-dead \
|
||||
--found 'simple/basic.html' \
|
||||
httrack 'BASEURL/simple/basic.html'
|
||||
@@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An all-304 update of a tiny site (headers under the 32K rollback threshold)
|
||||
# is a healthy run: it must not trip the no-data rollback as a fake outage.
|
||||
|
||||
set -eu
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--log-found 'no files updated' \
|
||||
--log-not-found 'No data seems to have been transferred' \
|
||||
--found 'mini304/index.html' --found 'mini304/page.html' \
|
||||
httrack 'BASEURL/mini304/index.html'
|
||||
@@ -48,14 +48,12 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-reconcile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-sniff.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
@@ -64,7 +62,6 @@ TESTS = \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-corrupt.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
01_zlib-savename-cached.test \
|
||||
@@ -98,11 +95,6 @@ TESTS = \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test \
|
||||
34_local-maxtime.test \
|
||||
35_local-maxsize.test \
|
||||
36_local-bigcrawl.test \
|
||||
37_local-cache-outage.test \
|
||||
38_local-update-304.test
|
||||
33_local-delayed.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -16,17 +16,13 @@
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# --max-mirror-bytes N \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --max/--min-mirror-bytes bound the mirrored content bytes (host root).
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
# --rerun-dead re-runs with the server stopped: the no-data rollback must
|
||||
# restore the previous hts-cache generation byte-identical.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -39,7 +35,6 @@ key="${testdir}/server.key"
|
||||
tls=
|
||||
verbose=
|
||||
rerun=
|
||||
rerun_dead=
|
||||
tmpdir=
|
||||
serverpid=
|
||||
crawlpid=
|
||||
@@ -97,7 +92,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
|
||||
# --- parse leading control flags --------------------------------------------
|
||||
declare -a audit=()
|
||||
declare -a cookies=()
|
||||
skip_delayed_audit=""
|
||||
scheme=http
|
||||
pos=0
|
||||
args=("$@")
|
||||
@@ -105,8 +99,7 @@ nargs=$#
|
||||
while test "$pos" -lt "$nargs"; do
|
||||
case "${args[$pos]}" in
|
||||
--debug) verbose=1 ;;
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--rerun-dead) rerun_dead=1 ;; # re-run with the server stopped (cache rollback)
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--no-purge)
|
||||
nopurge=1
|
||||
audit+=("--no-purge")
|
||||
@@ -123,14 +116,11 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
cookies+=("${args[$pos]}")
|
||||
;;
|
||||
--skip-delayed-audit)
|
||||
skip_delayed_audit=1
|
||||
;;
|
||||
--errors | --files)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory | --log-found | --log-not-found | --max-mirror-bytes | --min-mirror-bytes)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -245,43 +235,6 @@ if test -n "$rerun"; then
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- optional dead pass: server stopped, the cache must survive the rollback --
|
||||
if test -n "$rerun_dead"; then
|
||||
zip="${out}/hts-cache/new.zip"
|
||||
test -s "$zip" || die "no cache was written by the first pass"
|
||||
cp "$zip" "${tmpdir}/cache-before.zip"
|
||||
cp "${out}/hts-log.txt" "${tmpdir}/log-before.txt"
|
||||
kill "$serverpid" 2>/dev/null
|
||||
wait "$serverpid" 2>/dev/null
|
||||
serverpid=
|
||||
info "re-running httrack against the stopped server"
|
||||
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" \
|
||||
"${moreargs[@]}" "${hts[@]}" >"${log}.dead" 2>&1 &
|
||||
crawlpid=$!
|
||||
wait "$crawlpid" || true
|
||||
crawlpid=
|
||||
result "OK (dead pass ran)"
|
||||
# The dead pass must have gone through the no-data rollback, not bailed out
|
||||
# before the mirror loop (which would leave the cache trivially untouched).
|
||||
info "checking the dead pass hit the rollback"
|
||||
if grep -aq "No data seems to have been transferred" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "rollback notice not found in hts-log.txt"
|
||||
exit 1
|
||||
fi
|
||||
info "checking the previous cache generation was restored"
|
||||
if cmp -s "$zip" "${tmpdir}/cache-before.zip" &&
|
||||
test ! -e "${out}/hts-cache/old.zip"; then
|
||||
result "OK"
|
||||
else
|
||||
result "new.zip differs from the pre-outage cache (or old.zip left behind)"
|
||||
exit 1
|
||||
fi
|
||||
# Audits below describe the healthy crawl, not the dead pass.
|
||||
cp "${tmpdir}/log-before.txt" "${out}/hts-log.txt"
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
hostroot=
|
||||
for cand in "${out}/127.0.0.1_${port}" "${out}/127.0.0.1"; do
|
||||
@@ -293,15 +246,12 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107).
|
||||
# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483)
|
||||
if test -z "$skip_delayed_audit"; then
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107)
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
@@ -359,24 +309,6 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--max-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} <= ${audit[$i]} bytes"
|
||||
if test "$sz" -le "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too big"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--min-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} >= ${audit[$i]} bytes"
|
||||
if test "$sz" -ge "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too small"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
|
||||
@@ -15,7 +15,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
@@ -43,416 +42,6 @@ PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"""
|
||||
|
||||
|
||||
# --- /big/ seeded pseudo-site (36_local-bigcrawl) ---------------------------
|
||||
# Deterministic ~360-file tree; bodies derive from sha256(BIG_SEED, name) so
|
||||
# every run serves identical content and the test pins exact counts.
|
||||
BIG_SEED = "bigcrawl-lite-1"
|
||||
BIG_PAGES = 96
|
||||
BIG_FANOUT = 4
|
||||
# Fixed validator: a matching If-Modified-Since gets 304, so the update pass
|
||||
# revalidates instead of re-downloading.
|
||||
BIG_LASTMOD = "Mon, 01 Jan 2024 00:00:00 GMT"
|
||||
|
||||
BIG_CTYPES = {
|
||||
"html": "text/html",
|
||||
"css": "text/css",
|
||||
"js": "application/x-javascript",
|
||||
"png": "image/png",
|
||||
"gif": "image/gif",
|
||||
"jpg": "image/jpeg",
|
||||
"webp": "image/webp",
|
||||
"pdf": "application/pdf",
|
||||
"woff2": "font/woff2",
|
||||
"mp4": "video/mp4",
|
||||
"webm": "video/webm",
|
||||
"mp3": "audio/mpeg",
|
||||
"vtt": "text/vtt",
|
||||
"xml": "text/xml",
|
||||
"svg": "image/svg+xml",
|
||||
"jar": "application/java-archive",
|
||||
"bin": "application/octet-stream",
|
||||
}
|
||||
|
||||
# Honest magic bytes per claimed type so the #478 sniff never contests.
|
||||
BIG_MAGIC = {
|
||||
"png": b"\x89PNG\r\n\x1a\n",
|
||||
"gif": b"GIF89a",
|
||||
"jpg": b"\xff\xd8\xff\xe0",
|
||||
"webp": b"RIFF\x10\x27\x00\x00WEBPVP8 ",
|
||||
"pdf": b"%PDF-1.4\n",
|
||||
"woff2": b"wOF2",
|
||||
"mp4": b"\x00\x00\x00\x18ftypmp42",
|
||||
"webm": b"\x1a\x45\xdf\xa3",
|
||||
"mp3": b"ID3\x04\x00\x00\x00\x00\x00\x00",
|
||||
"jar": b"PK\x03\x04",
|
||||
}
|
||||
|
||||
|
||||
def big_blob(name, size):
|
||||
out = b""
|
||||
n = 0
|
||||
while len(out) < size:
|
||||
out += hashlib.sha256(f"{BIG_SEED}/{name}/{n}".encode()).digest()
|
||||
n += 1
|
||||
return out[:size]
|
||||
|
||||
|
||||
def big_asset(name):
|
||||
ext = name.rsplit(".", 1)[-1]
|
||||
size = 200 + int(hashlib.sha256(name.encode()).hexdigest(), 16) % 3800
|
||||
raw = big_blob(name, size)
|
||||
if ext in ("css", "js", "txt"):
|
||||
return b"/* " + raw.hex().encode() + b" */"
|
||||
return BIG_MAGIC.get(ext, b"") + raw
|
||||
|
||||
|
||||
def big_html(title, inner):
|
||||
page = (
|
||||
"<!DOCTYPE html><html><head><title>%s</title></head><body>\n%s\n</body></html>"
|
||||
% (
|
||||
title,
|
||||
inner,
|
||||
)
|
||||
)
|
||||
return page.encode()
|
||||
|
||||
|
||||
def _hexfill(name):
|
||||
return big_blob(name, 160).hex()
|
||||
|
||||
|
||||
HOME = '<a href="/big/index.html">home</a>'
|
||||
|
||||
BIG_TEXT_ASSETS = {
|
||||
"site.css": (
|
||||
"body { background: url(bg.png); } /* %s */" % _hexfill("site.css"),
|
||||
"text/css",
|
||||
),
|
||||
"print.css": ("p { margin: 0; } /* %s */" % _hexfill("print.css"), "text/css"),
|
||||
"blk.css": (
|
||||
'@import "blk2.css";\n'
|
||||
'@font-face { font-family: big; src: local("Nope Sans"), '
|
||||
'url(font.woff2) format("woff2"); }\n'
|
||||
"/* %s */" % _hexfill("blk.css"),
|
||||
"text/css",
|
||||
),
|
||||
# Absolute url() must come back relative after the rewrite (test greps it);
|
||||
# the \/ escapes collapse to an already-linked URL if taken literally.
|
||||
"blk2.css": (
|
||||
"body { background: url(/big/a/blk2-bg.png); }\n"
|
||||
"i { background: url(/big\\/a\\/bg.png); }\n"
|
||||
"/* %s */" % _hexfill("blk2.css"),
|
||||
"text/css",
|
||||
),
|
||||
# .open() grabs its first arg only (a method there is rejected, #218), so
|
||||
# the window.open single-URL form is the token-detected shape.
|
||||
"app.js": (
|
||||
'var im = new Image(); im.src = "/big/a/js-img.png";\n'
|
||||
'function pop() { window.open("/big/a/js-data.bin"); }\n'
|
||||
"// %s\n" % _hexfill("app.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
"heavy.js": (
|
||||
'var h = new Image(); h.src = "/big/a/js1.png";\n'
|
||||
'function nav() { location.href = "/big/p/1.html"; }\n'
|
||||
'function pop() { window.open("/big/a/js2.bin"); }\n'
|
||||
"// %s\n" % _hexfill("heavy.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
# text/javascript is fetched but never scanned: the URL inside must stay
|
||||
# out of the mirror.
|
||||
"decoy.js": (
|
||||
'var d = new Image(); d.src = "/big/x/never-scanned.png";\n',
|
||||
"text/javascript",
|
||||
),
|
||||
"subs.vtt": ("WEBVTT\n\n00:00.000 --> 00:01.000\nbig\n", "text/vtt"),
|
||||
"logo.svg": (
|
||||
'<svg xmlns="http://www.w3.org/2000/svg" width="4" height="4">'
|
||||
'<image href="ref.png" width="4" height="4"/></svg>',
|
||||
"image/svg+xml",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _fam_feeds(port):
|
||||
return (
|
||||
'<link rel="alternate" type="application/rss+xml" href="/big/f12/rss.xml">'
|
||||
'<a href="/big/f12/atom.xml">atom</a>'
|
||||
'<a href="/big/f12/sitemap.xml">sitemap</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_plain(port):
|
||||
return (
|
||||
'<a href="../f1/one.html">one</a>'
|
||||
'<a href="./two.html">two</a>'
|
||||
'<a href="../../big/f1/tri.html">tri</a>'
|
||||
'<a href="/big/f1/abs.html">abs</a>'
|
||||
'<a href="/big/f1/list.html">list</a>'
|
||||
'<a href="/big/f1/list.html?page=2">p2</a>'
|
||||
'<a href="/big/f1/list.html?page=3&sort=asc">p3</a>'
|
||||
'<a href="/big/f1/dir">dir</a>'
|
||||
'<a href="">self</a><a href="#">frag</a>'
|
||||
'<a href="mailto:big@example.com">mail</a>'
|
||||
'<a href="tel:+15551234">tel</a>'
|
||||
'<a href="data:text/plain;base64,aGk=">data</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_srcset(port):
|
||||
return (
|
||||
'<img src="/big/a/f2-base.png">'
|
||||
'<img srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png">'
|
||||
'<img data-srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png" loading="lazy">'
|
||||
'<picture><source type="image/webp" srcset="/big/a/f2-alt.webp">'
|
||||
'<img src="/big/a/f2-base.png"></picture>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_media(port):
|
||||
return (
|
||||
'<video src="/big/a/clip.mp4" poster="/big/a/poster.jpg">'
|
||||
'<source src="/big/a/clip.webm" type="video/webm">'
|
||||
'<track src="/big/a/subs.vtt" kind="subtitles" srclang="en">'
|
||||
"</video>"
|
||||
'<audio><source src="/big/a/tune.mp3" type="audio/mpeg"></audio>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_css(port):
|
||||
# image-set with descriptors is a proven-safe decoy (engine-surface §6).
|
||||
return (
|
||||
'<link rel="stylesheet" href="/big/a/print.css" media="print">'
|
||||
'<div style="background:url(/big/a/attr-bg.png)">styled</div>'
|
||||
'<style>@import "/big/a/blk.css"; h1 { background: url(/big/a/blk-bg.gif); }'
|
||||
' h2 { background-image: image-set("/big/x/is1.png" 1x, "/big/x/is2.png" 2x); }'
|
||||
"</style>"
|
||||
)
|
||||
|
||||
|
||||
def _fam_js(port):
|
||||
# The concatenated string is rejected by the scanner (no single literal).
|
||||
return (
|
||||
'<script src="/big/a/heavy.js"></script>'
|
||||
'<script src="/big/a/decoy.js"></script>'
|
||||
"<script>document.write('<a href=\"/big/f5/dw.html\">dw</a>');\n"
|
||||
'var nope = "xx-" + "/big/x/concat.html";</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_meta(port):
|
||||
# Extensionless decoy targets stay unfetchable even if the aggressive
|
||||
# parser fires (no known extension, no scheme: rejected in every state).
|
||||
return (
|
||||
'<meta http-equiv="refresh" content="2;URL=/big/f6/refreshed.html">'
|
||||
'<a href="/big/f6/based.html">based</a>'
|
||||
'<meta property="og:image" content="/big/x/og">'
|
||||
'<meta name="twitter:image" content="/big/x/tw">'
|
||||
'<script type="application/ld+json">'
|
||||
'{"@type": "Thing", "image": "/big/x/jsonld.png"}</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_legacy(port):
|
||||
# Comma-valued applet archive is rejected whole by the engine (decoy).
|
||||
return (
|
||||
'<a href="/big/f7/frames.html">frames</a>'
|
||||
'<img src="/big/a/map.gif" usemap="#m">'
|
||||
'<map name="m">'
|
||||
'<area shape="rect" coords="0,0,9,9" href="/big/f7/area.html"></map>'
|
||||
'<embed src="/big/a/e.pdf" type="application/pdf" width="9" height="9">'
|
||||
'<object data="/big/a/o.pdf" type="application/pdf"></object>'
|
||||
'<applet archive="/big/x/aj.jar,/big/x/bj.jar" width="1" height="1"></applet>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_svg(port):
|
||||
return (
|
||||
'<svg width="9" height="9">'
|
||||
'<image href="/big/a/svg-in.png" width="4" height="4"/>'
|
||||
'<use xlink:href="#icon"/></svg>'
|
||||
'<img src="/big/a/logo.svg">'
|
||||
)
|
||||
|
||||
|
||||
def _fam_i18n(port):
|
||||
return (
|
||||
'<a href="/big/f9/caf%C3%A9.html">cafe</a>'
|
||||
'<a href="/big/f9/latin1.html">latin1</a>'
|
||||
'<a href="/big/f9/metaonly.html">meta</a>'
|
||||
'<a href="/big/f9/bom.html">bom</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_http(port):
|
||||
return (
|
||||
'<a href="/big/r/hop1">chain</a>'
|
||||
'<a href="/big/r/get42">get42</a>'
|
||||
'<a href="/big/d/01">d01</a>'
|
||||
'<a href="/big/d/02">d02</a>'
|
||||
'<a href="/big/f10/empty.html">empty</a>'
|
||||
'<a href="/big/d/dl">dl</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_forms(port):
|
||||
# GET form action is rewritten but never fetched; formaction/ping are
|
||||
# outside the attribute tables (decoys).
|
||||
return (
|
||||
'<form action="/big/x/form-target.html" method="get">'
|
||||
'<input type="text" name="q">'
|
||||
'<input type="image" src="/big/a/btn.png" alt="go"></form>'
|
||||
'<a href="/big/f11/page.html">bare</a>'
|
||||
'<a href="/big/f11/page.html?utm_source=news&utm_medium=mail">utm</a>'
|
||||
'<a href="/big/f11/sess.html?PHPSESSID=deadbeef123">sess</a>'
|
||||
'<button formaction="/big/x/formact">go</button>'
|
||||
'<a href="/big/f11/page.html" ping="/big/x/ping">ping</a>'
|
||||
)
|
||||
|
||||
|
||||
BIG_FAMILIES = [
|
||||
_fam_feeds,
|
||||
_fam_plain,
|
||||
_fam_srcset,
|
||||
_fam_media,
|
||||
_fam_css,
|
||||
_fam_js,
|
||||
_fam_meta,
|
||||
_fam_legacy,
|
||||
_fam_svg,
|
||||
_fam_i18n,
|
||||
_fam_http,
|
||||
_fam_forms,
|
||||
]
|
||||
|
||||
|
||||
def big_link(m, style):
    # Three spellings of the same numbered page: bare sibling name, via the
    # parent directory, and site-absolute. `style` picks one by index.
    sibling = "%d.html" % m
    via_parent = "../p/%d.html" % m
    absolute = "/big/p/%d.html" % m
    spellings = [sibling, via_parent, absolute]
    return spellings[style]
|
||||
|
||||
|
||||
def big_page(n, port):
    # Build numbered page `n` of the /big/ tree: home/up/child navigation,
    # the shared stylesheet and script, two per-page images and one family
    # fragment. The link spelling rotates with n % 3.
    style = n % 3
    home = "/big/index.html" if style == 1 else "../index.html"
    parts = ['<a href="%s">home</a>' % home]

    if n > 0:
        parent = (n - 1) // BIG_FANOUT
        parts.append('<a href="%s">up</a>' % big_link(parent, style))

    # Children of n are the BIG_FANOUT pages following n * BIG_FANOUT; those
    # at or beyond BIG_PAGES are left out.
    first_child = n * BIG_FANOUT + 1
    parts.extend(
        '<a href="%s">p%d</a>' % (big_link(child, style), child)
        for child in range(first_child, first_child + BIG_FANOUT)
        if child < BIG_PAGES
    )

    parts.append('<link rel="stylesheet" href="/big/a/site.css">')
    parts.append('<script src="/big/a/app.js"></script>')

    exts = ("png", "gif", "jpg")
    ia = "/big/a/i%da.%s" % (n, exts[n % 3])
    ib = "/big/a/i%db.%s" % (n, exts[(n + 1) % 3])

    # The second image is carried by a different construct depending on
    # n % 4: plain <img>, table background, data-src, or video poster.
    image_markup = (
        '<img src="%s"><img src="%s">' % (ia, ib),
        '<img src="%s"><table background="%s"><tr><td>t</td></tr></table>'
        % (ia, ib),
        '<img src="%s"><img src="%s" data-src="%s">' % (ia, ia, ib),
        '<img src="%s" loading="lazy"><video poster="%s"></video>' % (ia, ib),
    )
    parts.append(image_markup[n % 4])

    family = BIG_FAMILIES[n % 12]
    parts.append(family(port))
    return big_html("p%d" % n, "\n".join(parts))
|
||||
|
||||
|
||||
def big_index(port):
    # Entry page of the /big/ site: shared assets, the page-tree root, a
    # deeply nested image, the f1 link variants, the error targets and a
    # bare "?" link.
    origin = "127.0.0.1:%d" % port
    long_query = "a" * 900
    body = "".join(
        [
            '<link rel="stylesheet" href="/big/a/site.css">',
            '<script src="/big/a/app.js"></script>',
            '<a href="p/0.html">root</a>',
            '<img src="/big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png">',
            '<a href="/big/f1/long.html?x=%s">long</a>' % long_query,
            '<a href="/big/f1/gzok.html">gzok</a>',
            '<a href="//%s/big/f1/protorel.html">protorel</a>' % origin,
            '<a href="http://%s/big/f1/abshost.html">abshost</a>' % origin,
            '<a href="/big/e/404.html">e404</a>',
            '<a href="/big/e/410.html">e410</a>',
            '<a href="/big/e/500.html">e500</a>',
            '<a href="/big/e/gztrunc.html">gzt</a>',
            '<a href="?">query</a>',
        ]
    )
    return big_html("big index", body)
|
||||
|
||||
|
||||
# Request path -> (status code, Location value).
BIG_REDIRECTS = {
    # Two-hop chain ending on a regular page.
    "/big/r/hop1": (301, "/big/r/hop2"),
    "/big/r/hop2": (302, "/big/f10/land.html"),
    # Single hop to a binary asset.
    "/big/r/get42": (301, "/big/a/doc.pdf"),
    # Directory path without its trailing slash.
    "/big/f1/dir": (301, "/big/f1/dir/"),
}
|
||||
|
||||
# Request path -> paragraph text of a minimal page.
BIG_SIMPLE_PAGES = {
    "/big/p/two.html": "dot-slash target",
    # f1
    "/big/f1/one.html": "one",
    "/big/f1/tri.html": "tri",
    "/big/f1/abs.html": "abs",
    "/big/f1/dir/": "dir index",
    "/big/f1/long.html": "long",
    "/big/f1/gzok.html": "gzok",
    "/big/f1/protorel.html": "protorel",
    "/big/f1/abshost.html": "abshost",
    # f5 / f6
    "/big/f5/dw.html": "dw target",
    "/big/f6/refreshed.html": "refreshed",
    "/big/f6/sub/leaf.html": "leaf",
    # f7
    "/big/f7/fa.html": "frame a",
    "/big/f7/fb.html": "frame b",
    "/big/f7/fn.html": "noframes",
    "/big/f7/area.html": "area",
    # f10 / f11
    "/big/f10/land.html": "landed",
    "/big/f11/page.html": "the page",
    "/big/f11/sess.html": "the sess page",
}
|
||||
|
||||
# Downloads whose URL has no extension, so the saved name must come from the
# type on the wire (#478 contract).
# Request path -> (extension key, Content-Disposition value or None).
BIG_DOWNLOADS = {
    "/big/d/01": ("pdf", None),
    "/big/d/02": ("png", None),
    "/big/d/dl": ("pdf", 'attachment; filename="named.pdf"'),
}
|
||||
|
||||
|
||||
def _big_rss(port):
|
||||
# purl.org marker makes the feed parse; item URLs are already-linked pages.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">\n'
|
||||
"<channel><title>big</title><link>http://127.0.0.1:%d/big/index.html</link>\n"
|
||||
"<item><title>i1</title><link>http://127.0.0.1:%d/big/p/1.html</link>\n"
|
||||
'<enclosure url="http://127.0.0.1:%d/big/p/2.html" type="text/html"/></item>\n'
|
||||
"</channel></rss>\n" % (port, port, port)
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_atom(port):
|
||||
# No purl marker: emitted verbatim, its URL must never be fetched.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<feed xmlns="http://www.w3.org/2005/Atom"><title>big</title>\n'
|
||||
"<entry><title>e1</title>"
|
||||
'<link href="http://127.0.0.1:%d/big/x/atom-only.html"/>'
|
||||
"</entry></feed>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_sitemap(port):
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
|
||||
"<url><loc>http://127.0.0.1:%d/big/x/sitemap-only.html</loc></url>\n"
|
||||
"</urlset>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
class Handler(SimpleHTTPRequestHandler):
|
||||
# Quieter logging; the launcher captures httrack's own log anyway.
|
||||
def log_message(self, fmt, *args):
|
||||
@@ -831,16 +420,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_redir_target(self):
    # Fixed HTML body for /redir/target.html.
    page = b"<html><body>redirect target</body></html>\n"
    self.send_raw(page, "text/html")
|
||||
|
||||
# --- /mini304/: tiny fully-cacheable site (an update gets only 304s) ---
|
||||
def route_mini304_index(self):
    # Index of the tiny cacheable site: one relative link to page.html,
    # sent through big_send.
    page = (
        b"<html><body>\n"
        b'\t<a href="page.html">page</a>\n'
        b"</body></html>\n"
    )
    self.big_send(page, "text/html")
|
||||
|
||||
def route_mini304_page(self):
    # Leaf page of the tiny cacheable site, sent through big_send.
    page = b"<html><body>tiny cacheable page</body></html>\n"
    self.big_send(page, "text/html")
|
||||
|
||||
# --- delayed-type degenerate paths (issues #5/#107) --------------------
|
||||
def route_delayed_index(self):
|
||||
self.send_html(
|
||||
@@ -885,43 +464,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_delayed_empty(self):
    # Zero-byte HTML body: a 200 with Content-Length: 0.
    nothing = b""
    self.send_raw(nothing, "text/html")
|
||||
|
||||
# -E time-limit (#481): how long each trickle page keeps dripping. It is far
# longer than any -E budget, so the crawl can only end through an engine-side
# abort.
TRICKLE_SECONDS = 60
|
||||
|
||||
def send_bin_index(self):
    """Send the index that links p0.bin through p7.bin (trickle and bigfiles share it)."""
    rows = []
    for idx in range(8):
        rows.append('\t<a href="p%d.bin">p%d</a>\n' % (idx, idx))
    self.send_html("".join(rows))
|
||||
|
||||
def route_trickle_index(self):
    # /trickle/index.html is the shared p0..p7 .bin index.
    self.send_bin_index()
|
||||
|
||||
def route_trickle_page(self):
    # Announce 2 bytes per trickle second, then write "xy" once a second.
    # HEAD requests get the headers only.
    seconds = self.TRICKLE_SECONDS
    self.send_response(200)
    for name, value in (
        ("Content-Type", "application/octet-stream"),
        ("Content-Length", str(2 * seconds)),
    ):
        self.send_header(name, value)
    self.end_headers()
    if self.command != "HEAD":
        try:
            remaining = seconds
            while remaining > 0:
                self.wfile.write(b"xy")
                self.wfile.flush()
                time.sleep(1.0)
                remaining -= 1
        except OSError:
            # A failed write (e.g. the peer closing the socket) simply ends
            # the response.
            pass
|
||||
|
||||
# -M byte cap (#77): size of each /bigfiles/ payload. The files are large and
# served at once, so a crawl overruns -M immediately.
BIGFILE_BYTES = 640 * 1024
|
||||
|
||||
def route_bigfiles_index(self):
    # /bigfiles/index.html is the shared p0..p7 .bin index.
    self.send_bin_index()
|
||||
|
||||
def route_bigfile(self):
    # BIGFILE_BYTES bytes of "x", sent in a single response.
    payload = b"x" * self.BIGFILE_BYTES
    self.send_raw(payload, "application/octet-stream")
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -967,24 +509,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/delayed/index.html": route_delayed_index,
|
||||
"/trickle/index.html": route_trickle_index,
|
||||
"/trickle/p0.bin": route_trickle_page,
|
||||
"/trickle/p1.bin": route_trickle_page,
|
||||
"/trickle/p2.bin": route_trickle_page,
|
||||
"/trickle/p3.bin": route_trickle_page,
|
||||
"/trickle/p4.bin": route_trickle_page,
|
||||
"/trickle/p5.bin": route_trickle_page,
|
||||
"/trickle/p6.bin": route_trickle_page,
|
||||
"/trickle/p7.bin": route_trickle_page,
|
||||
"/bigfiles/index.html": route_bigfiles_index,
|
||||
"/bigfiles/p0.bin": route_bigfile,
|
||||
"/bigfiles/p1.bin": route_bigfile,
|
||||
"/bigfiles/p2.bin": route_bigfile,
|
||||
"/bigfiles/p3.bin": route_bigfile,
|
||||
"/bigfiles/p4.bin": route_bigfile,
|
||||
"/bigfiles/p5.bin": route_bigfile,
|
||||
"/bigfiles/p6.bin": route_bigfile,
|
||||
"/bigfiles/p7.bin": route_bigfile,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
@@ -1003,150 +527,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
"/mini304/index.html": route_mini304_index,
|
||||
"/mini304/page.html": route_mini304_page,
|
||||
}
|
||||
|
||||
# --- /big/ seeded pseudo-site ------------------------------------------
|
||||
|
||||
def big_send(self, body, ctype, code=200, extra=()):
    # Send `body` with status `code`. A 200 whose request carries an
    # If-Modified-Since equal to BIG_LASTMOD is answered with an empty 304
    # instead; every other 200 is stamped with Last-Modified: BIG_LASTMOD.
    # `extra` is an iterable of (name, value) headers sent after
    # Content-Length. The body is not written for HEAD requests.
    is_ok = code == 200
    if is_ok and self.headers.get("If-Modified-Since") == BIG_LASTMOD:
        self.send_response(304)
        self.send_header("Content-Length", "0")
        self.end_headers()
        return

    self.send_response(code)
    if is_ok:
        self.send_header("Last-Modified", BIG_LASTMOD)
    self.send_header("Content-Type", ctype)
    self.send_header("Content-Length", str(len(body)))
    for header in extra:
        name, value = header
        self.send_header(name, value)
    self.end_headers()

    if self.command == "HEAD":
        return
    self.wfile.write(body)
|
||||
|
||||
def big_error(self, code, reason):
    # Error page showing the numeric code plus the HOME markup; `reason`
    # travels in an X-Reason header.
    markup = "<p>%d</p>%s" % (code, HOME)
    page = big_html("error", markup)
    self.big_send(page, "text/html", code=code, extra=[("X-Reason", reason)])
|
||||
|
||||
def route_big(self):
    # Answer one request for the /big/ pseudo-site. The checks run in a
    # fixed order, exact-path tables before prefix matches, so for instance
    # /big/p/two.html is served from BIG_SIMPLE_PAGES and never reaches the
    # numbered-page branch.
    parts = urlsplit(self.path)
    path = unquote(parts.path)
    port = self.server.server_address[1]
    html = "text/html"

    if path in BIG_REDIRECTS:
        status, target = BIG_REDIRECTS[path]
        self.send_response(status)
        self.send_header("Location", target)
        self.send_header("Content-Length", "0")
        self.end_headers()
        return

    if path == "/big/index.html":
        self.big_send(big_index(port), html)
        return

    if path in BIG_SIMPLE_PAGES:
        page = big_html(path, "<p>%s</p>%s" % (BIG_SIMPLE_PAGES[path], HOME))
        if path != "/big/f1/gzok.html":
            self.big_send(page, html)
        else:
            # The one simple page that goes out gzip-encoded.
            self.big_send(
                gzip.compress(page, mtime=0),
                html,
                extra=[("Content-Encoding", "gzip")],
            )
        return

    if path == "/big/f1/list.html":
        # Pagination: distinct content per query string.
        listing = "<p>listing %s</p>%s" % (parts.query or "1", HOME)
        self.big_send(big_html("list", listing), html)
        return

    if path == "/big/f6/based.html":
        markup = (
            '<base href="http://127.0.0.1:%d/big/f6/sub/">'
            '<a href="leaf.html">leaf</a>' % port
        )
        self.big_send(big_html("based", markup), html)
        return

    if path == "/big/f7/frames.html":
        frameset = (
            b'<html><frameset cols="50%,50%"><frame src="fa.html">'
            b'<frame src="fb.html"><noframes><body><a href="fn.html">fn</a>'
            b"</body></noframes></frameset></html>"
        )
        self.big_send(frameset, html)
        return

    if path == "/big/f9/café.html":
        self.big_send(big_html("cafe", "<p>cafe</p>%s" % HOME), html)
        return

    if path == "/big/f9/latin1.html":
        latin1_page = b"<html><body><p>caf\xe9 latin</p></body></html>"
        self.big_send(latin1_page, "text/html; charset=ISO-8859-1")
        return

    if path == "/big/f9/metaonly.html":
        meta_page = (
            '<html><head><meta charset="utf-8"></head>'
            "<body><p>café meta</p></body></html>"
        )
        self.big_send(meta_page.encode(), html)
        return

    if path == "/big/f9/bom.html":
        bom = b"\xef\xbb\xbf"
        self.big_send(bom + big_html("bom", "<p>bom</p>%s" % HOME), html)
        return

    if path == "/big/f10/empty.html":
        self.big_send(b"", html)
        return

    if path == "/big/f12/rss.xml":
        self.big_send(_big_rss(port), "text/xml")
        return

    if path == "/big/f12/atom.xml":
        self.big_send(_big_atom(port), "application/xml")
        return

    if path == "/big/f12/sitemap.xml":
        self.big_send(_big_sitemap(port), "text/xml")
        return

    if path.startswith("/big/p/"):
        # Numbered pages: /big/p/<n>.html with 0 <= n < BIG_PAGES.
        stem = path[len("/big/p/") : -len(".html")]
        try:
            n = int(stem)
        except ValueError:
            n = -1
        if path.endswith(".html") and 0 <= n < BIG_PAGES:
            self.big_send(big_page(n, port), html)
        else:
            self.big_error(404, "no such page")
        return

    under_assets = path.startswith("/big/a/")
    if under_assets or path.startswith("/big/x/"):
        # Both prefixes have the same length, so one slice serves either.
        name = path[len("/big/a/") :]
        if under_assets and name in BIG_TEXT_ASSETS:
            text, ctype = BIG_TEXT_ASSETS[name]
            self.big_send(text.encode(), ctype)
        elif name.endswith(".html"):
            # Decoy targets 200 so a parser leak becomes a mirror file.
            self.big_send(big_html(name, "<p>%s</p>" % name), html)
        else:
            suffix = name.rsplit(".", 1)[-1]
            ctype = BIG_CTYPES.get(suffix, "application/octet-stream")
            self.big_send(big_asset(name), ctype)
        return

    if path in BIG_DOWNLOADS:
        ext, cdispo = BIG_DOWNLOADS[path]
        headers = [("Content-Disposition", cdispo)] if cdispo else []
        asset_name = path[len("/big/") :] + "." + ext
        self.big_send(big_asset(asset_name), BIG_CTYPES[ext], extra=headers)
        return

    if path == "/big/e/404.html":
        self.big_error(404, "Not Found")
        return

    if path == "/big/e/410.html":
        self.big_error(410, "Gone")
        return

    if path == "/big/e/500.html":
        self.big_error(500, "Server Error")
        return

    if path == "/big/e/gztrunc.html":
        # Half a gzip stream, honest Content-Length: decode fails, and the
        # missing Last-Modified keeps it the one uncacheable resource.
        whole = gzip.compress(big_html("gz", "x" * 3000), mtime=0)
        truncated = whole[: len(whole) // 2]
        self.send_response(200)
        self.send_header("Content-Type", "text/html")
        self.send_header("Content-Encoding", "gzip")
        self.send_header("Content-Length", str(len(truncated)))
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(truncated)
        return

    self.big_error(404, "no such big path")
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
def reject_fragment(self):
|
||||
@@ -1162,9 +544,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
if path.startswith("/big/"):
|
||||
self.route_big()
|
||||
return True
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
if handler is not None:
|
||||
|
||||
@@ -211,9 +211,7 @@ main() {
|
||||
# lintian ourselves below as the real gate.
|
||||
local -a debuild_opts=(--no-lintian)
|
||||
local -a build_opts=()
|
||||
# -d: a source build runs no debhelper, so don't require Build-Depends
|
||||
# locally (the buildds and the --sbuild gate enforce them).
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S -d)
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||
if [[ $unsigned -eq 1 ]]; then
|
||||
build_opts+=(-us -uc)
|
||||
else
|
||||
@@ -236,15 +234,12 @@ main() {
|
||||
|
||||
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||
# reject the Debian "unstable" distribution. Suppressed tags are stale-local-
|
||||
# lintian skew, not package defects: newer-standards-version, and
|
||||
# recommended-field (old lintian still wants the Priority field the sid
|
||||
# lintian in CI accepts dropping). set -e turns any error/warning tag into
|
||||
# a failure.
|
||||
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||
# means the local lintian is older than the buildds', not a package
|
||||
# defect, so suppress it. set -e turns any error/warning tag into a failure.
|
||||
info "running lintian gate (--fail-on=error,warning)"
|
||||
lintian --profile debian -I -i --fail-on=error,warning \
|
||||
--suppress-tags newer-standards-version,recommended-field \
|
||||
"${changes[@]}"
|
||||
--suppress-tags newer-standards-version "${changes[@]}"
|
||||
|
||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user