Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
6d1b67792e Add --strip-query to drop query keys from dedup naming (#112)
Two URLs that differ only in tracking or session query parameters
(?utm_source=x versus ?utm_source=y) were saved as separate files, and a
single CGI could fan out into thousands of near-duplicate pages.
fil_normalized already sorted query args, so reordered parameters dedup,
but there was no way to drop a named key.

--strip-query "[host/pattern=]key1,key2,..." (repeatable) removes the
listed keys when computing the dedup key and the saved name. The fetched
URL is untouched, so a required sid= is still sent on the wire; only the
local namespace collapses. Patterns match the normalized host/path with
the +/- filter glob (strjoker), last match wins as in the filter list,
and stripping is decoupled from urlhack (-%u) so it never silently
no-ops with -%u0.

It all funnels through one chokepoint, fil_normalized: an internal
fil_normalized_filtered() strips then delegates, and hts_query_strip_keys
resolves the per-URL key list. The strip pass walks every query field,
including empty and trailing ones, so its output is a fixpoint under the
read path's second normalization (otherwise dedup silently misses).
Exported ABI is unchanged; the strip_query field is appended at the tail
of httrackp. Covered by a -#test=stripquery self-test (degenerate queries
like a=&b&c== and a 50-case idempotency fixpoint) and an end-to-end dedup
crawl test.

Closes #112

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-27 11:03:56 +02:00
23 changed files with 44 additions and 498 deletions

2
debian/control vendored
View File

@@ -30,7 +30,7 @@ Description: Copy websites to your computer (Offline browser)
Package: webhttrack
Architecture: any
Multi-Arch: foreign
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, chromium | firefox-esr | www-browser
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, firefox-esr | chromium | www-browser
Replaces: webhttrack-common (<< 3.43.9-2)
Breaks: webhttrack-common (<< 3.43.9-2)
Suggests: httrack, httrack-doc

View File

@@ -24,7 +24,6 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-EN, \-\-max\-time[=N]\fR ]
[ \fB\-AN, \-\-max\-rate[=N]\fR ]
[ \fB\-%cN, \-\-connection\-per\-second[=N]\fR ]
[ \fB\-%G, \-\-pause\fR ]
[ \fB\-GN, \-\-max\-pause[=N]\fR ]
[ \fB\-cN, \-\-sockets[=N]\fR ]
[ \fB\-TN, \-\-timeout[=N]\fR ]
@@ -50,7 +49,6 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-%p, \-\-preserve\fR ]
[ \fB\-%T, \-\-utf8\-conversion\fR ]
[ \fB\-bN, \-\-cookies[=N]\fR ]
[ \fB\-%K, \-\-cookies\-file\fR ]
[ \fB\-u, \-\-check\-type[=N]\fR ]
[ \fB\-j, \-\-parse\-java[=N]\fR ]
[ \fB\-sN, \-\-robots[=N]\fR ]
@@ -156,8 +154,6 @@ maximum mirror time in seconds (60=1 minute, 3600=1 hour) (\-\-max\-time[=N])
maximum transfer rate in bytes/seconds (1000=1KB/s max) (\-\-max\-rate[=N])
.IP \-%cN
maximum number of connections/seconds (*%c10) (\-\-connection\-per\-second[=N])
.IP \-%G
random pause of MIN[:MAX] seconds between files (e.g. %G5:10) (\-\-pause <param>)
.IP \-GN
pause transfer if N bytes reached, and wait until lock file is deleted (\-\-max\-pause[=N])
.SS Flow control:
@@ -216,8 +212,6 @@ links conversion to UTF\-8 (\-\-utf8\-conversion)
.SS Spider options:
.IP \-bN
accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N])
.IP \-%K
load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>)
.IP \-u
check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N])
.IP \-j
@@ -234,8 +228,6 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
.IP \-%u
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
.br
opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a)
.IP \-%A
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
.br

View File

@@ -112,10 +112,6 @@ const char *hts_optalias[][4] = {
{"include-query-string", "-%q", "single", ""},
{"strip-query", "-%g", "param1",
"strip [host/pattern=]key1,key2,... from URLs"},
{"cookies-file", "-%K", "param1",
"load extra cookies from a Netscape cookies.txt"},
{"pause", "-%G", "param1",
"random pause of MIN[:MAX] seconds between files"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},
@@ -132,9 +128,6 @@ const char *hts_optalias[][4] = {
{"tolerant", "-%B", "single", ""},
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
{"urlhack", "-%u", "single", ""},
{"keep-www-prefix", "-%j", "single", ""},
{"keep-double-slashes", "-%o", "single", ""},
{"keep-query-order", "-%y", "single", ""},
{"user-agent", "-F", "param1", "user-agent identity"},
{"referer", "-%R", "param1", "default referer URL"},
{"from", "-%E", "param1", "from email address"},

View File

@@ -35,7 +35,6 @@ Please visit our Website: http://www.httrack.com
#include <fcntl.h>
#include <ctype.h>
#include <stdint.h> /* uint64_t for the pause mixer (already a hard dep via md5.h) */
/* File defs */
#include "htscore.h"
@@ -524,12 +523,9 @@ int httpmirror(char *url1, httrackp * opt) {
opt->cookie = &cookie;
cookie.max_len = 30000; // max len
strcpybuff(cookie.data, "");
// Load the mirror's cookies.txt, then the one in the current directory
// Charger cookies.txt par défaut ou cookies.txt du miroir
cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt");
cookie_load(opt->cookie, "", "cookies.txt");
// A user-supplied cookie file is merged last so it wins on conflicts
if (strnotempty(StringBuff(opt->cookies_file)))
cookie_load(opt->cookie, "", StringBuff(opt->cookies_file));
} else
opt->cookie = NULL;
@@ -3315,21 +3311,6 @@ HTS_INLINE int back_fillmax(struct_back * sback, httrackp * opt,
return -1; /* plus de place */
}
/* Seed-derived: stable within a gap, rerolls per launch; a per-call rand()
would bias the delay toward min_ms (see header). Jitter, not crypto. */
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms) {
uint64_t z = (uint64_t) seed;
if (max_ms <= min_ms)
return min_ms;
/* SplitMix64 finalizer: scrambles the low-entropy ms timestamp. */
z += 0x9E3779B97F4A7C15ULL;
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
z ^= z >> 31;
return min_ms + (int) (z % (uint64_t) (max_ms - min_ms + 1));
}
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
int n = opt->maxsoc - back_nsoc(sback);
@@ -3350,18 +3331,6 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
}
}
// #185 randomized inter-file pause: non-blocking, one launch per gap
if (n > 0 && opt->pause_max_ms > 0 && HTS_STAT.last_connect > 0) {
TStamp opTime =
HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect;
TStamp lap = mtime_local() - opTime;
if (lap < hts_pause_target_ms(opTime, opt->pause_min_ms, opt->pause_max_ms))
n = 0;
else
n = 1;
}
return n;
}
@@ -3773,14 +3742,6 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->strip_query))
StringCopyS(to->strip_query, from->strip_query);
if (StringNotEmpty(from->cookies_file))
StringCopyS(to->cookies_file, from->cookies_file);
if (from->pause_max_ms > 0) {
to->pause_min_ms = from->pause_min_ms;
to->pause_max_ms = from->pause_max_ms;
}
if (from->retry > -1)
to->retry = from->retry;

View File

@@ -234,10 +234,8 @@ struct hash_struct {
coucal adrfil;
/* former address+path -> link index (renamed/moved entries) */
coucal former_adrfil;
/* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */
hts_boolean norm_host;
hts_boolean norm_slash;
hts_boolean norm_query;
/* scratch buffers reused across lookups (not reentrant) */
int normalized;
/* query-strip keys (not owned); set from opt->strip_query at hash_init */
const char *strip_query;
char normfil[HTS_URLMAXSIZE * 2];
@@ -373,11 +371,6 @@ char *next_token(char *p, int flag);
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip);
/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and
the query-argument sort independently (the urlhack sub-flags). */
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash, int do_query);
/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
'\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
@@ -418,10 +411,6 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
timestamp seed so it is stable within one gap and rerolls per launch. */
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
/* Schedule more links from the heap into free slots. Returns the number queued,
or <=0 if none could be added (no free slot / paused / stopped). */
int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,

View File

@@ -1570,30 +1570,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
com++;
}
break; // url hack
case 'j':
opt->no_www_dedup =
HTS_TRUE; // --keep-www-prefix: keep www.X != X
if (*(com + 1) == '0') {
opt->no_www_dedup = HTS_FALSE;
com++;
}
break;
case 'o':
opt->no_slash_dedup =
HTS_TRUE; // --keep-double-slashes: keep //
if (*(com + 1) == '0') {
opt->no_slash_dedup = HTS_FALSE;
com++;
}
break;
case 'y':
opt->no_query_dedup =
HTS_TRUE; // --keep-query-order: keep ?b&a order
if (*(com + 1) == '0') {
opt->no_query_dedup = HTS_FALSE;
com++;
}
break;
case 'v':
opt->verbosedisplay = HTS_VERBOSE_FULL;
if (isdigit((unsigned char) *(com + 1))) {
@@ -1976,51 +1952,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
StringCat(opt->strip_query, argv[na]);
}
break;
case 'K': // cookies-file: extra Netscape cookies.txt to preload
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
HTS_PANIC_PRINTF(
"Option cookies-file needs a blank space and "
"a cookies.txt path");
printf("Example: --cookies-file \"/home/me/cookies.txt\"\n");
htsmain_free();
return -1;
} else {
na++;
if (strlen(argv[na]) >= 1024) {
HTS_PANIC_PRINTF("Cookie file path too long");
htsmain_free();
return -1;
}
StringCopy(opt->cookies_file, argv[na]);
}
break;
case 'G': // pause: randomized inter-file delay MIN[:MAX] seconds
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
HTS_PANIC_PRINTF("Option pause needs a blank space and a "
"delay in seconds (MIN[:MAX])");
printf("Example: --pause 5:10\n");
htsmain_free();
return -1;
} else {
double pmin = 0, pmax = 0;
int nf;
na++;
nf = sscanf(argv[na], "%lf:%lf", &pmin, &pmax);
if (nf < 2)
pmax = pmin; /* a single value means a fixed delay */
/* positive-form bounds: NaN fails every comparison, so this
rejects it before the undefined (int)(NaN*1000) cast */
if (nf < 1 || !(pmin >= 0 && pmax >= pmin && pmax <= 86400)) {
HTS_PANIC_PRINTF("Invalid --pause range (expected "
"MIN[:MAX] seconds, 0<=MIN<=MAX<=86400)");
htsmain_free();
return -1;
}
opt->pause_min_ms = (int) (pmin * 1000.0);
opt->pause_max_ms = (int) (pmax * 1000.0);
}
break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }

View File

@@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
const lien_url*const lien = (const lien_url*) value;
const char *const adr = !former ? lien->adr : lien->former_adr;
const char *const fil = !former ? lien->fil : lien->former_fil;
const char *const adr_norm =
adr != NULL ? (hash->norm_host ? jump_normalized_const(adr)
: jump_identification_const(adr))
: NULL;
const char *const adr_norm = adr != NULL ?
( hash->normalized ? jump_normalized_const(adr)
: jump_identification_const(adr) )
: NULL;
// copy address
assertf(adr_norm != NULL);
@@ -123,9 +123,8 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
keybuf, sizeof(keybuf));
if (hash->norm_slash || hash->norm_query || keys != NULL) {
fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)],
keys, hash->norm_slash, hash->norm_query);
if (hash->normalized || keys != NULL) {
fil_normalized_filtered(fil, &hash->normfil[strlen(hash->normfil)], keys);
} else {
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
}
@@ -140,7 +139,8 @@ static int key_adrfil_equals_generic(void *arg,
coucal_key_const a_,
coucal_key_const b_,
const int former) {
hash_struct *const hash = (hash_struct *) arg;
hash_struct *const hash = (hash_struct*) arg;
const int normalized = hash->normalized;
const lien_url*const a = (const lien_url*) a_;
const lien_url*const b = (const lien_url*) b_;
const char *const a_adr = !former ? a->adr : a->former_adr;
@@ -157,10 +157,10 @@ static int key_adrfil_equals_generic(void *arg,
assertf(b_fil != NULL);
// skip scheme and authentication to the domain (possibly without www.)
ja = hash->norm_host ? jump_normalized_const(a_adr)
: jump_identification_const(a_adr);
jb = hash->norm_host ? jump_normalized_const(b_adr)
: jump_identification_const(b_adr);
ja = normalized
? jump_normalized_const(a_adr) : jump_identification_const(a_adr);
jb = normalized
? jump_normalized_const(b_adr) : jump_identification_const(b_adr);
assertf(ja != NULL);
assertf(jb != NULL);
if (strcasecmp(ja, jb) != 0) {
@@ -175,12 +175,9 @@ static int key_adrfil_equals_generic(void *arg,
const char *const keysb =
hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));
if (hash->norm_slash || hash->norm_query || keysa != NULL ||
keysb != NULL) {
fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash,
hash->norm_query);
fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash,
hash->norm_query);
if (normalized || keysa != NULL || keysb != NULL) {
fil_normalized_filtered(a_fil, hash->normfil, keysa);
fil_normalized_filtered(b_fil, hash->normfil2, keysb);
return strcmp(hash->normfil, hash->normfil2) == 0;
} else {
return strcmp(a_fil, b_fil) == 0;
@@ -240,14 +237,11 @@ static int key_former_adrfil_equals(void *arg,
return key_adrfil_equals_generic(arg, a, b, 1);
}
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized) {
void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
hash->sav = coucal_new(0);
hash->adrfil = coucal_new(0);
hash->former_adrfil = coucal_new(0);
/* urlhack is the umbrella; per-feature negatives opt out of each part */
hash->norm_host = normalized && !opt->no_www_dedup;
hash->norm_slash = normalized && !opt->no_slash_dedup;
hash->norm_query = normalized && !opt->no_query_dedup;
hash->normalized = normalized;
/* snapshot the query-strip list (not owned; valid for the hash lifetime) */
hash->strip_query =
StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
@@ -306,26 +300,6 @@ void hash_free(hash_struct *hash) {
}
}
/* Test helper: do the two URLs dedupe to the same key under opt's urlhack
flags? Exercises the live hash compare (norm_host/slash/query resolution). */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
const char *adrb, const char *filb) {
hash_struct hash;
lien_url la, lb;
hts_boolean eq;
memset(&la, 0, sizeof(la));
memset(&lb, 0, sizeof(lb));
la.adr = key_duphandler(NULL, adra);
la.fil = key_duphandler(NULL, fila);
lb.adr = key_duphandler(NULL, adrb);
lb.fil = key_duphandler(NULL, filb);
hash_init(opt, &hash, opt->urlhack);
eq = key_adrfil_equals(&hash, &la, &lb);
hash_free(&hash);
return eq;
}
// retour: position ou -1 si non trouvé
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type) {

View File

@@ -51,12 +51,8 @@ typedef enum hash_struct_type {
} hash_struct_type;
// tables de hachage
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized);
void hash_init(httrackp *opt, hash_struct *hash, int normalized);
void hash_free(hash_struct *hash);
/* Test helper: HTS_TRUE if the two URLs dedupe together under opt's urlhack
flags. */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
const char *adrb, const char *filb);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type);
void hash_write(hash_struct * hash, size_t lpos);

View File

@@ -521,7 +521,6 @@ void help(const char *app, int more) {
infomsg(" EN maximum mirror time in seconds (60=1 minute, 3600=1 hour)");
infomsg(" AN maximum transfer rate in bytes/seconds (1000=1KB/s max)");
infomsg(" %cN maximum number of connections/seconds (*%c10)");
infomsg(" %G random pause of MIN[:MAX] seconds between files (e.g. %G5:10)");
infomsg
(" GN pause transfer if N bytes reached, and wait until lock file is deleted");
infomsg("");
@@ -573,7 +572,6 @@ void help(const char *app, int more) {
infomsg("");
infomsg("Spider options:");
infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)");
infomsg(" %K load extra cookies from a Netscape cookies.txt");
infomsg
(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)");
infomsg
@@ -590,9 +588,6 @@ void help(const char *app, int more) {
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
infomsg
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
infomsg(" opt out of one url-hack part: --keep-www-prefix "
"(www.foo.com<>foo.com), --keep-double-slashes (//), "
"--keep-query-order (?b&a)");
infomsg
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
infomsg(" shortcut: '--assume standard' is equivalent to -%A "

View File

@@ -3610,10 +3610,7 @@ static int sortNormFnc(const void *a_, const void *b_) {
return strcmp(*a + 1, *b + 1);
}
/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or
sort query arguments (DO_QUERY) so equivalent URLs dedupe. */
static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
int do_query) {
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
char lastc = 0;
int gotquery = 0;
int ampargs = 0;
@@ -3623,8 +3620,8 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
for(i = j = 0; source[i] != '\0'; i++) {
if (!gotquery && source[i] == '?')
gotquery = ampargs = 1;
if (do_slash && !gotquery && lastc == '/' && source[i] == '/') {
// foo//bar -> foo/bar
if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar
) {
} else {
if (gotquery && source[i] == '&') {
ampargs++;
@@ -3636,7 +3633,7 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
dest[j++] = '\0';
/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
if (do_query && ampargs > 1) {
if (ampargs > 1) {
char **amps = malloct(ampargs * sizeof(char *));
char *copyBuff = NULL;
size_t qLen = 0;
@@ -3684,10 +3681,6 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
return dest;
}
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
return fil_normalized_ex(source, dest, 1, 1);
}
/* Is query key ARG[0..keylen) in the comma-separated STRIP list? "*" = all;
case-sensitive, space-trimmed tokens. */
static int hts_query_key_stripped(const char *arg, size_t keylen,
@@ -3718,9 +3711,8 @@ static int hts_query_key_stripped(const char *arg, size_t keylen,
}
/* see htscore.h */
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash,
int do_query) {
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip) {
const char *query;
char BIGSTK tmp[HTS_URLMAXSIZE * 2];
htsbuff cb;
@@ -3729,7 +3721,7 @@ char *fil_normalized_filtered_ex(const char *source, char *dest,
/* No strip list, or no query: plain normalization. */
if (strip == NULL || *strip == '\0' ||
(query = strchr(source, '?')) == NULL) {
return fil_normalized_ex(source, dest, do_slash, do_query);
return fil_normalized(source, dest);
}
/* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
@@ -3758,13 +3750,7 @@ char *fil_normalized_filtered_ex(const char *source, char *dest,
break;
query++;
}
return fil_normalized_ex(tmp, dest, do_slash, do_query);
}
/* see htscore.h */
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip) {
return fil_normalized_filtered_ex(source, dest, strip, 1, 1);
return fil_normalized(tmp, dest);
}
/* see htscore.h */
@@ -6040,14 +6026,8 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
opt->sizehack = HTS_FALSE;
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = HTS_FALSE;
opt->no_slash_dedup = HTS_FALSE;
opt->no_query_dedup = HTS_FALSE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
StringCopy(opt->cookies_file, "");
opt->pause_min_ms = 0;
opt->pause_max_ms = 0;
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");
@@ -6193,7 +6173,6 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
StringFree(opt->footer);
StringFree(opt->mod_blacklist);
StringFree(opt->strip_query);
StringFree(opt->cookies_file);
StringFree(opt->path_html);
StringFree(opt->path_html_utf8);

View File

@@ -237,13 +237,9 @@ int url_savename(lien_adrfilsave *const afs,
// www-42.foo.com -> foo.com
// foo.com/bar//foobar -> foo.com/bar/foobar
if (opt->urlhack) {
// dedup-lookup key; honor the per-feature negatives like htshash.c so
// distinct URLs keep distinct savenames (else keep normadr = adr)
if (!opt->no_www_dedup)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil =
fil_normalized_filtered_ex(fil_complete, normfil_, strip,
!opt->no_slash_dedup, !opt->no_query_dedup);
// copy of adr (without protocol), used for lookups (see urlhack)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
} else {
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
char *pos = strchr(adr_complete, ':');
@@ -256,11 +252,9 @@ int url_savename(lien_adrfilsave *const afs,
normadr = normadr_;
}
}
// strip still applies with urlhack off (host left untouched); no // or
// query-sort here, to match the hash key (norm_slash/norm_query are 0 when
// urlhack is off) so a URL is looked up under the key it was stored with
// strip still applies with urlhack off (host left untouched)
if (strip != NULL)
normfil = fil_normalized_filtered_ex(fil_complete, normfil_, strip, 0, 0);
normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
}
// à afficher sans ftp://

View File

@@ -531,14 +531,6 @@ struct httrackp {
htsoptstate state; /**< embedded live engine state */
String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
appended at the tail to keep field offsets stable */
hts_boolean
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
String cookies_file; /**< extra Netscape cookies.txt to preload
(--cookies-file) */
int pause_min_ms; /**< inter-file pause lower bound, ms (0=off, #185) */
int pause_max_ms; /**< inter-file pause upper bound, ms */
};
/* Running statistics for a mirror. */

View File

@@ -3602,28 +3602,16 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
// check whether URLHack is harmless or not
if (opt->urlhack) {
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
n_adr[0] = n_fil[0] = '\0';
(void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr));
(void) fil_normalized(moved->fil, n_fil);
(void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr));
(void) fil_normalized(urlfil(), pn_fil);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,

View File

@@ -899,71 +899,12 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
if (to->parseall != HTS_TRUE)
err = 1;
/* String field: a non-empty source deep-copies across, an empty source
leaves the target intact (StringNotEmpty guard). Covers the exported
copy_htsopt String path that no crawl test reaches. */
StringCopy(from->cookies_file, "/tmp/jar.txt");
StringCopy(to->cookies_file, "");
copy_htsopt(from, to);
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
err = 1;
StringCopy(from->cookies_file, "");
copy_htsopt(from, to);
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
err = 1;
/* #185 pause pair: copied when enabled (max>0), the 0 sentinel skips */
from->pause_min_ms = 5000;
from->pause_max_ms = 10000;
to->pause_min_ms = to->pause_max_ms = 0;
copy_htsopt(from, to);
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
err = 1;
from->pause_min_ms = from->pause_max_ms = 0;
copy_htsopt(from, to);
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
err = 1;
hts_free_opt(from);
hts_free_opt(to);
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
return err;
}
static int st_pause(httrackp *opt, int argc, char **argv) {
int err = 0, i, seen_low = 0, seen_high = 0;
(void) opt;
(void) argc;
(void) argv;
/* Consecutive-ms seeds (production shape: launch timestamps a few ms apart)
must stay in range and spread, not collapse to a bound -- worst case for a
weak low-bit mixer. */
for (i = 0; i < 10000; i++) {
int t = hts_pause_target_ms((TStamp) (1719500000000LL + i), 5000, 10000);
if (t < 5000 || t > 10000)
err = 1;
seen_low |= (t < 6000);
seen_high |= (t > 9000);
}
if (!seen_low || !seen_high)
err = 1;
if (hts_pause_target_ms(12345, 8000, 8000) != 8000) /* equal bounds = fixed */
err = 1;
/* deterministic: a seed yields the same target even after an intervening call
with another seed (no global PRNG state to perturb it) */
{
int a = hts_pause_target_ms(99, 5000, 10000);
(void) hts_pause_target_ms(54321, 5000, 10000);
if (hts_pause_target_ms(99, 5000, 10000) != a)
err = 1;
}
printf("pause: %s\n", err ? "FAIL" : "OK");
return err;
}
static int st_relative(httrackp *opt, int argc, char **argv) {
char s[HTS_URLMAXSIZE * 2];
@@ -1231,53 +1172,6 @@ static int st_stripquery(httrackp *opt, int argc, char **argv) {
return 0;
}
/* -%u url-hack split (#271): each sub-flag must toggle independently. */
static int st_urlhack(httrackp *opt, int argc, char **argv) {
(void) argc;
(void) argv;
#define EQ(aa, fa, ab, fb) hash_url_equals(opt, aa, fa, ab, fb)
/* urlhack on, no opt-outs: www, // and query order all collapse */
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
/* keep-www-prefix: host off; // and query still collapse */
opt->no_www_dedup = HTS_TRUE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
opt->no_www_dedup = HTS_FALSE;
/* keep-double-slashes: // significant; www, query order still collapse */
opt->no_slash_dedup = HTS_TRUE;
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
opt->no_slash_dedup = HTS_FALSE;
/* keep-query-order: query order significant; www and // still collapse */
opt->no_query_dedup = HTS_TRUE;
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
opt->no_query_dedup = HTS_FALSE;
/* all opt-outs == urlhack off entirely */
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
opt->urlhack = HTS_FALSE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
#undef EQ
printf("urlhack self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1296,8 +1190,6 @@ static const struct selftest_entry {
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
{"stripquery", "", "--strip-query pattern/key stripping self-test",
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},
@@ -1310,7 +1202,6 @@ static const struct selftest_entry {
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
st_strsafe},
{"copyopt", "", "copy_htsopt option-copy self-test", st_copyopt},
{"pause", "", "randomized inter-file pause target self-test", st_pause},
{"relative", "<link> <curr-file>", "relative link between two paths",
st_relative},
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",

View File

@@ -90,16 +90,4 @@ refused "dangling-quote argument not refused cleanly"
run_only "$tmp/q-lone" '"'
refused "lone-quote argument not refused cleanly"
# --pause (#185): valid MIN[:MAX] accepted; malformed, reversed, over-range and
# non-finite values refused cleanly. NaN defeats naive `<`/`>` checks (it
# compares false to everything), so it must not slip through to the int cast.
run "$tmp/pause-ok" --pause 0.2:0.4
accepted "$tmp/pause-ok" "#185: valid --pause range rejected"
run "$tmp/pause-fix" --pause 0.2
accepted "$tmp/pause-fix" "#185: valid fixed --pause rejected"
for bad in nan nan:5 5:nan inf 10:5 99999; do
run "$tmp/pause-bad" --pause "$bad"
refused "#185: invalid --pause '$bad' not refused cleanly"
done
exit 0

View File

@@ -1,15 +0,0 @@
#!/bin/bash
#
# --pause (#185): the inter-file pause target must stay in [min,max] and spread
# across it (a per-call rand() would collapse it toward min). Driven by the
# in-process 'httrack -#test=pause' test. POSIX-portable ($(BASH) is /bin/sh on macOS).
set -eu
# 'run' is an ignored placeholder argument.
out=$(httrack -#test=pause run)
test "$out" = "pause: OK" || {
echo "expected 'pause: OK', got: $out" >&2
exit 1
}

View File

@@ -1,8 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# -%u url-hack split (#271): www / // / query-order dedup toggle independently.
# All assertions live in the engine self-test (hash compare flag resolution).
httrack -O /dev/null -#test=urlhack run | grep -q "urlhack self-test OK"

View File

@@ -16,8 +16,3 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
# control: no stripping -> both query-named variants are saved
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
httrack 'BASEURL/stripquery/index.html'
# strip still applies with url-hack off (-%u0): exercises the urlhack-off
# savename branch, which must normalize the dedup key the same way the hash does
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
httrack 'BASEURL/stripquery/index.html' -%u0 --strip-query 'utm_source'

View File

@@ -1,22 +0,0 @@
#!/bin/bash
#
# End-to-end --cookies-file (#215): /gated/secret.php needs a cookie no page
# ever Set-Cookies, so it is reachable only when the option preloads it from a
# Netscape cookies.txt. Locks the CLI->opt->cookie_load->wire plumbing.
set -e
: "${top_srcdir:=..}"
# preloaded cookie -> secret page is served. -o0 means a 500 leaves no file, so
# --found/--files only hold when the secret is genuinely fetched (200).
bash "$top_srcdir/tests/local-crawl.sh" --cookie 'session=opensesame' \
--errors 0 --files 2 \
--found 'gated/index.html' --found 'gated/secret.html' \
httrack 'BASEURL/gated/index.php' -o0
# control: without the cookie the secret 500s; -o0 suppresses the error page so
# its absence is real (error + missing file)
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
--found 'gated/index.html' --not-found 'gated/secret.html' \
httrack 'BASEURL/gated/index.php' -o0

View File

@@ -1,29 +0,0 @@
#!/bin/bash
#
# --pause (#185): a fixed inter-file delay must slow a multi-file crawl. Measure
# the same crawl with and without --pause and compare: the harness overhead
# cancels, leaving only the pause. Integer seconds keep it portable (BSD date
# has no %N); a lower bound is not timing-flaky since a pause only adds time.
set -e
: "${top_srcdir:=..}"
run() { # echoes the wall-clock seconds of one crawl
local t0 t1
t0=$(date +%s)
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
httrack 'BASEURL/types/index.html' -c1 "$@" >/dev/null 2>&1
t1=$(date +%s)
echo $((t1 - t0))
}
base=$(run)
paused=$(run --pause 0.5)
delta=$((paused - base))
echo "crawl: ${base}s, with --pause 0.5: ${paused}s (delta ${delta}s)"
if [ "$delta" -lt 2 ]; then
echo "FAIL: --pause did not delay the crawl (delta ${delta}s)" >&2
exit 1
fi

View File

@@ -41,7 +41,6 @@ TESTS = \
01_engine-idna.test \
01_engine-mime.test \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-relative.test \
01_engine-savename.test \
@@ -49,7 +48,6 @@ TESTS = \
01_engine-simplify.test \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \
@@ -73,8 +71,6 @@ TESTS = \
23_local-errpage.test \
24_local-resume-overlap.test \
25_local-mime-exclude.test \
26_local-strip-query.test \
27_local-cookies-file.test \
28_local-pause.test
26_local-strip-query.test
CLEANFILES = check-network_sh.cache

View File

@@ -12,14 +12,11 @@
# the mirror directory name.
#
# Usage:
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# bash local-crawl.sh [--tls] [--root DIR] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
# which the ephemeral port forces into the cookie domain) and passes it to
# httrack via --cookies-file, to exercise preloaded cookies.
set -u
@@ -88,7 +85,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
# --- parse leading control flags --------------------------------------------
declare -a audit=()
declare -a cookies=()
scheme=http
pos=0
args=("$@")
@@ -109,10 +105,6 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
root="${args[$pos]}"
;;
--cookie)
pos=$((pos + 1))
cookies+=("${args[$pos]}")
;;
--errors | --files)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
@@ -166,17 +158,6 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
done
# --- materialize any --cookie entries into a cookies.txt ---------------------
if test "${#cookies[@]}" -gt 0; then
jar="${tmpdir}/cookies.txt"
: >"$jar"
for spec in "${cookies[@]}"; do
printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \
"$port" "${spec%%=*}" "${spec#*=}" >>"$jar"
done
hts+=(--cookies-file "$jar")
fi
# --- run httrack -------------------------------------------------------------
which httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')

View File

@@ -110,19 +110,6 @@ class Handler(SimpleHTTPRequestHandler):
return self.fail_cookie("badger")
self.send_html("\tThis is a test.")
# --cookies-file (#215): the secret page needs a cookie no page ever sets,
# so it is reachable only when --cookies-file preloads it.
GATE_COOKIE = ("session", "opensesame")
def route_gated_index(self):
self.send_html('\tThis is a <a href="secret.php">link</a>')
def route_gated_secret(self):
name, value = self.GATE_COOKIE
if self.request_cookies().get(name) != value:
return self.fail_cookie(name)
self.send_html("\tThis is the secret.")
def route_robots(self):
body = b"User-agent: *\nDisallow:\n"
self.send_response(200)
@@ -358,8 +345,6 @@ class Handler(SimpleHTTPRequestHandler):
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
"/cookies/third.php": route_third,
"/gated/index.php": route_gated_index,
"/gated/secret.php": route_gated_secret,
"/robots.txt": route_robots,
"/types/index.html": route_types_index,
"/types/control.php": route_types,