Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
0133da24b5 ci: add a MemorySanitizer job for the offline engine self-tests
MSan is the only sanitizer that catches a read of uninitialized memory --
the class of #143, where the size filter tested an uninitialized stack
LLint and forbade files at random. ASan and UBSan let that through.

MSan reports any byte produced by an uninstrumented library as
uninitialized, so the job stays inside our own code: clang, a static link
(the MSan runtime is not injected into shared objects), --disable-https to
drop openssl, and only the offline 01_engine-* self-tests minus the
zlib-backed cache trio. Those self-tests drive the hostile-input parsers
(charset, mime, html, entities, idna, filters) straight through MSan.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-27 08:27:31 +02:00
19 changed files with 41 additions and 578 deletions

View File

@@ -3,7 +3,7 @@
.\"
.\" This file is generated by man/makeman.sh; do not edit by hand.
.\" SPDX-License-Identifier: GPL-3.0-or-later
.TH httrack 1 "27 June 2026" "httrack website copier"
.TH httrack 1 "26 June 2026" "httrack website copier"
.SH NAME
httrack \- offline browser : copy websites to a local directory
.SH SYNOPSIS
@@ -43,7 +43,6 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-x, \-\-replace\-external\fR ]
[ \fB\-%x, \-\-disable\-passwords\fR ]
[ \fB\-%q, \-\-include\-query\-string\fR ]
[ \fB\-%g, \-\-strip\-query\fR ]
[ \fB\-o, \-\-generate\-errors\fR ]
[ \fB\-X, \-\-purge\-old[=N]\fR ]
[ \fB\-%p, \-\-preserve\fR ]
@@ -199,8 +198,6 @@ replace external html links by error pages (\-\-replace\-external)
do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords)
.IP \-%q
*include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string)
.IP \-%g
strip query keys for dedup ([host/pattern=]key1,key2,...) (\-\-strip\-query <param>)
.IP \-o
*generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors)
.IP \-X
@@ -228,8 +225,6 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
.IP \-%u
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
.br
opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a)
.IP \-%A
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
.br

View File

@@ -60,9 +60,6 @@ Please visit our Website: http://www.httrack.com
param1 : this option must be alone, and needs one distinct parameter (-P <path>)
param0 : this option must be alone, but the parameter should be put together (+*.gif)
*/
/* clang-format off: hand-aligned table; clang-format reflows the whole
initializer (2->4 space) on any edit, churning every untouched row. */
/* clang-format off */
const char *hts_optalias[][4] = {
/* {"","","",""}, */
{"path", "-O", "param1", "output path"},
@@ -110,8 +107,6 @@ const char *hts_optalias[][4] = {
{"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x",
"single", ""},
{"include-query-string", "-%q", "single", ""},
{"strip-query", "-%g", "param1",
"strip [host/pattern=]key1,key2,... from URLs"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},
@@ -128,9 +123,6 @@ const char *hts_optalias[][4] = {
{"tolerant", "-%B", "single", ""},
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
{"urlhack", "-%u", "single", ""},
{"keep-www-prefix", "-%j", "single", ""},
{"keep-double-slashes", "-%o", "single", ""},
{"keep-query-order", "-%y", "single", ""},
{"user-agent", "-F", "param1", "user-agent identity"},
{"referer", "-%R", "param1", "default referer URL"},
{"from", "-%E", "param1", "from email address"},
@@ -249,7 +241,6 @@ const char *hts_optalias[][4] = {
{"", "", "", ""}
};
/* clang-format on */
/*
Check for alias in command-line

View File

@@ -3739,9 +3739,6 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->user_agent))
StringCopyS(to->user_agent, from->user_agent);
if (StringNotEmpty(from->strip_query))
StringCopyS(to->strip_query, from->strip_query);
if (from->retry > -1)
to->retry = from->retry;

View File

@@ -234,12 +234,8 @@ struct hash_struct {
coucal adrfil;
/* former address+path -> link index (renamed/moved entries) */
coucal former_adrfil;
/* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */
hts_boolean norm_host;
hts_boolean norm_slash;
hts_boolean norm_query;
/* query-strip keys (not owned); set from opt->strip_query at hash_init */
const char *strip_query;
/* scratch buffers reused across lookups (not reentrant) */
int normalized;
char normfil[HTS_URLMAXSIZE * 2];
char normfil2[HTS_URLMAXSIZE * 2];
char catbuff[CATBUFF_SIZE];
@@ -368,22 +364,6 @@ int fspc(httrackp * opt, FILE * fp, const char *type);
char *next_token(char *p, int flag);
/* Like fil_normalized(), but first drops query keys in STRIP (comma-separated,
"*" = all); STRIP NULL/empty behaves exactly like fil_normalized(). */
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip);
/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and
the query-argument sort independently (the urlhack sub-flags). */
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash, int do_query);
/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
'\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
const char *hts_query_strip_keys(const char *rules, const char *adr,
const char *fil, char *dest, size_t destsize);
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
owns it and must release it with freet(). Return NULL on missing/unreadable
file (readfile_or substitutes defaultdata instead). The byte content is NOT

View File

@@ -1570,30 +1570,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
com++;
}
break; // url hack
case 'j':
opt->no_www_dedup =
HTS_TRUE; // --keep-www-prefix: keep www.X != X
if (*(com + 1) == '0') {
opt->no_www_dedup = HTS_FALSE;
com++;
}
break;
case 'o':
opt->no_slash_dedup =
HTS_TRUE; // --keep-double-slashes: keep //
if (*(com + 1) == '0') {
opt->no_slash_dedup = HTS_FALSE;
com++;
}
break;
case 'y':
opt->no_query_dedup =
HTS_TRUE; // --keep-query-order: keep ?b&a order
if (*(com + 1) == '0') {
opt->no_query_dedup = HTS_FALSE;
com++;
}
break;
case 'v':
opt->verbosedisplay = HTS_VERBOSE_FULL;
if (isdigit((unsigned char) *(com + 1))) {
@@ -1961,21 +1937,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
}
break;
case 'g': // strip-query: accumulate "[pattern=]keys" entries
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
HTS_PANIC_PRINTF("Option strip-query needs a blank space and "
"[host/pattern=]key1,key2,...");
printf("Example: --strip-query "
"\"www.example.com/*=utm_source,sid\"\n");
htsmain_free();
return -1;
} else {
na++;
if (StringNotEmpty(opt->strip_query))
StringCat(opt->strip_query, "\n");
StringCat(opt->strip_query, argv[na]);
}
break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }

View File

@@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
const lien_url*const lien = (const lien_url*) value;
const char *const adr = !former ? lien->adr : lien->former_adr;
const char *const fil = !former ? lien->fil : lien->former_fil;
const char *const adr_norm =
adr != NULL ? (hash->norm_host ? jump_normalized_const(adr)
: jump_identification_const(adr))
: NULL;
const char *const adr_norm = adr != NULL ?
( hash->normalized ? jump_normalized_const(adr)
: jump_identification_const(adr) )
: NULL;
// copy address
assertf(adr_norm != NULL);
@@ -117,18 +117,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
// copy link
assertf(fil != NULL);
{
/* resolve the per-URL strip keys; strip applies even when urlhack is off */
char BIGSTK keybuf[HTS_URLMAXSIZE];
const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
keybuf, sizeof(keybuf));
if (hash->norm_slash || hash->norm_query || keys != NULL) {
fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)],
keys, hash->norm_slash, hash->norm_query);
} else {
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
}
if (hash->normalized) {
fil_normalized(fil, &hash->normfil[strlen(hash->normfil)]);
} else {
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
}
// hash
@@ -140,7 +132,8 @@ static int key_adrfil_equals_generic(void *arg,
coucal_key_const a_,
coucal_key_const b_,
const int former) {
hash_struct *const hash = (hash_struct *) arg;
hash_struct *const hash = (hash_struct*) arg;
const int normalized = hash->normalized;
const lien_url*const a = (const lien_url*) a_;
const lien_url*const b = (const lien_url*) b_;
const char *const a_adr = !former ? a->adr : a->former_adr;
@@ -157,10 +150,10 @@ static int key_adrfil_equals_generic(void *arg,
assertf(b_fil != NULL);
// skip scheme and authentication to the domain (possibly without www.)
ja = hash->norm_host ? jump_normalized_const(a_adr)
: jump_identification_const(a_adr);
jb = hash->norm_host ? jump_normalized_const(b_adr)
: jump_identification_const(b_adr);
ja = normalized
? jump_normalized_const(a_adr) : jump_identification_const(a_adr);
jb = normalized
? jump_normalized_const(b_adr) : jump_identification_const(b_adr);
assertf(ja != NULL);
assertf(jb != NULL);
if (strcasecmp(ja, jb) != 0) {
@@ -168,23 +161,12 @@ static int key_adrfil_equals_generic(void *arg,
}
// now compare pathes
{
char BIGSTK ka[HTS_URLMAXSIZE], kb[HTS_URLMAXSIZE];
const char *const keysa =
hts_query_strip_keys(hash->strip_query, a_adr, a_fil, ka, sizeof(ka));
const char *const keysb =
hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));
if (hash->norm_slash || hash->norm_query || keysa != NULL ||
keysb != NULL) {
fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash,
hash->norm_query);
fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash,
hash->norm_query);
return strcmp(hash->normfil, hash->normfil2) == 0;
} else {
return strcmp(a_fil, b_fil) == 0;
}
if (normalized) {
fil_normalized(a_fil, hash->normfil);
fil_normalized(b_fil, hash->normfil2);
return strcmp(hash->normfil, hash->normfil2) == 0;
} else {
return strcmp(a_fil, b_fil) == 0;
}
}
@@ -240,17 +222,11 @@ static int key_former_adrfil_equals(void *arg,
return key_adrfil_equals_generic(arg, a, b, 1);
}
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized) {
void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
hash->sav = coucal_new(0);
hash->adrfil = coucal_new(0);
hash->former_adrfil = coucal_new(0);
/* urlhack is the umbrella; per-feature negatives opt out of each part */
hash->norm_host = normalized && !opt->no_www_dedup;
hash->norm_slash = normalized && !opt->no_slash_dedup;
hash->norm_query = normalized && !opt->no_query_dedup;
/* snapshot the query-strip list (not owned; valid for the hash lifetime) */
hash->strip_query =
StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
hash->normalized = normalized;
hts_set_hash_handler(hash->sav, opt);
hts_set_hash_handler(hash->adrfil, opt);
@@ -306,26 +282,6 @@ void hash_free(hash_struct *hash) {
}
}
/* Test-only helper: report whether (adra, fila) and (adrb, filb) compare equal
   through the live adr/fil key comparator. A throw-away hash context is set up
   from OPT (opt->urlhack is handed to hash_init() as the normalization switch)
   and released before returning. */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
                            const char *adrb, const char *filb) {
  hash_struct ctx;
  lien_url first, second;
  hts_boolean same;

  /* only adr/fil are populated; every other link field stays zeroed */
  memset(&first, 0, sizeof(first));
  first.adr = key_duphandler(NULL, adra);
  first.fil = key_duphandler(NULL, fila);

  memset(&second, 0, sizeof(second));
  second.adr = key_duphandler(NULL, adrb);
  second.fil = key_duphandler(NULL, filb);

  hash_init(opt, &ctx, opt->urlhack);
  same = key_adrfil_equals(&ctx, &first, &second);
  hash_free(&ctx);

  return same;
}
// returns: position, or -1 if not found
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type) {

View File

@@ -51,12 +51,8 @@ typedef enum hash_struct_type {
} hash_struct_type;
// hash tables
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized);
void hash_init(httrackp *opt, hash_struct *hash, int normalized);
void hash_free(hash_struct *hash);
/* Test helper: HTS_TRUE if the two URLs dedupe together under opt's urlhack
flags. */
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
const char *adrb, const char *filb);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
hash_struct_type type);
void hash_write(hash_struct * hash, size_t lpos);

View File

@@ -563,7 +563,6 @@ void help(const char *app, int more) {
(" %x do not include any password for external password protected websites (%x0 include)");
infomsg
(" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)");
infomsg(" %g strip query keys for dedup ([host/pattern=]key1,key2,...)");
infomsg
(" o *generate output html file in case of error (404..) (o0 don't generate)");
infomsg(" X *purge old files after update (X0 keep delete)");
@@ -588,9 +587,6 @@ void help(const char *app, int more) {
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
infomsg
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
infomsg(" opt out of one url-hack part: --keep-www-prefix "
"(www.foo.com<>foo.com), --keep-double-slashes (//), "
"--keep-query-order (?b&a)");
infomsg
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
infomsg(" shortcut: '--assume standard' is equivalent to -%A "

View File

@@ -3610,10 +3610,7 @@ static int sortNormFnc(const void *a_, const void *b_) {
return strcmp(*a + 1, *b + 1);
}
/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or
sort query arguments (DO_QUERY) so equivalent URLs dedupe. */
static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
int do_query) {
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
char lastc = 0;
int gotquery = 0;
int ampargs = 0;
@@ -3623,8 +3620,8 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
for(i = j = 0; source[i] != '\0'; i++) {
if (!gotquery && source[i] == '?')
gotquery = ampargs = 1;
if (do_slash && !gotquery && lastc == '/' && source[i] == '/') {
// foo//bar -> foo/bar
if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar
) {
} else {
if (gotquery && source[i] == '&') {
ampargs++;
@@ -3636,7 +3633,7 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
dest[j++] = '\0';
/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
if (do_query && ampargs > 1) {
if (ampargs > 1) {
char **amps = malloct(ampargs * sizeof(char *));
char *copyBuff = NULL;
size_t qLen = 0;
@@ -3684,153 +3681,6 @@ static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
return dest;
}
/* Public entry point: forward SOURCE/DEST to fil_normalized_ex() with both of
   its switches (do_slash, do_query) set to 1, and return that call's result. */
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
return fil_normalized_ex(source, dest, 1, 1);
}
/* Is query key ARG[0..keylen) listed in the comma-separated STRIP list?
   - tokens are compared case-sensitively, after trimming leading/trailing
     spaces;
   - a token that is exactly "*" matches every key;
   - empty tokens (",a", "a,,b", "a, ,b", trailing ",") never match, so a stray
     comma cannot accidentally select the empty key;
   - a NULL or empty STRIP matches nothing.
   Returns 1 when the key must be stripped, 0 otherwise. */
static int hts_query_key_stripped(const char *arg, size_t keylen,
                                  const char *strip) {
  const char *p;

  if (strip == NULL)
    return 0;

  for (p = strip; *p != '\0';) {
    const char *start = p;
    size_t toklen;

    /* isolate the next token, then step over its ',' separator */
    while (*p != '\0' && *p != ',')
      p++;
    toklen = (size_t) (p - start);
    if (*p == ',')
      p++;

    /* trim surrounding spaces */
    while (toklen > 0 && *start == ' ') {
      start++;
      toklen--;
    }
    while (toklen > 0 && start[toklen - 1] == ' ')
      toklen--;

    /* an empty token names no key at all */
    if (toklen == 0)
      continue;

    if (toklen == 1 && start[0] == '*')
      return 1;
    if (toklen == keylen && strncmp(start, arg, keylen) == 0)
      return 1;
  }
  return 0;
}
/* see htscore.h */
char *fil_normalized_filtered_ex(const char *source, char *dest,
const char *strip, int do_slash,
int do_query) {
const char *query;
char BIGSTK tmp[HTS_URLMAXSIZE * 2];
htsbuff cb;
int wrote = 0;
/* No strip list, or no query: plain normalization. */
if (strip == NULL || *strip == '\0' ||
(query = strchr(source, '?')) == NULL) {
return fil_normalized_ex(source, dest, do_slash, do_query);
}
/* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
every field incl. empty/trailing ("a&","?&&") so the result is a fixpoint
(the read re-normalizes it; a dropped empty arg would miss dedup). */
cb = htsbuff_ptr(tmp, sizeof(tmp));
htsbuff_catn(&cb, source, (size_t) (query - source));
for (query++;;) {
const char *const arg = query;
const char *eq = NULL;
size_t keylen, arglen;
while (*query != '\0' && *query != '&') {
if (eq == NULL && *query == '=')
eq = query;
query++;
}
arglen = (size_t) (query - arg);
keylen = eq != NULL ? (size_t) (eq - arg) : arglen;
if (!hts_query_key_stripped(arg, keylen, strip)) {
htsbuff_catc(&cb, wrote ? '&' : '?');
htsbuff_catn(&cb, arg, arglen);
wrote = 1;
}
if (*query == '\0')
break;
query++;
}
return fil_normalized_ex(tmp, dest, do_slash, do_query);
}
/* see htscore.h */
/* Convenience wrapper: forwards to fil_normalized_filtered_ex() with both
   do_slash and do_query set to 1, and returns that call's result. */
char *fil_normalized_filtered(const char *source, char *dest,
const char *strip) {
return fil_normalized_filtered_ex(source, dest, strip, 1, 1);
}
/* see htscore.h */
const char *hts_query_strip_keys(const char *rules, const char *adr,
const char *fil, char *dest, size_t destsize) {
const char *p, *q;
const char *result = NULL;
char BIGSTK url[HTS_URLMAXSIZE * 2];
if (rules == NULL || *rules == '\0' || destsize == 0)
return NULL;
/* Match string = normalized host/path, query removed. jump_normalized_const
collapses www+scheme/auth so read and write (double-normalized) agree;
query excluded keeps the decision on host/path only. */
url[0] = '\0';
strcatbuff(url, jump_normalized_const(adr));
if (fil[0] != '/')
strcatbuff(url, "/");
q = strchr(fil, '?');
if (q != NULL)
strncatbuff(url, fil, (int) (q - fil));
else
strcatbuff(url, fil);
/* Walk the '\n' entries; last match wins (like the +/- filter eval). Each is
"pattern=keys"; no '=' is the bare form, pattern "*". */
for (p = rules; *p != '\0';) {
const char *const line = p;
const char *eol, *eq, *keys;
char BIGSTK pat[HTS_URLMAXSIZE * 2];
while (*p != '\0' && *p != '\n')
p++;
eol = p;
if (*p == '\n')
p++;
if (eol == line)
continue;
eq = memchr(line, '=', (size_t) (eol - line));
if (eq != NULL) {
size_t patlen = (size_t) (eq - line);
if (patlen >= sizeof(pat))
patlen = sizeof(pat) - 1;
memcpy(pat, line, patlen);
pat[patlen] = '\0';
keys = eq + 1;
} else {
pat[0] = '*';
pat[1] = '\0';
keys = line;
}
if (strjoker(url, pat, NULL, NULL) != NULL) {
size_t klen = (size_t) (eol - keys);
if (klen >= destsize)
klen = destsize - 1;
memcpy(dest, keys, klen);
dest[klen] = '\0';
result = dest;
}
}
return result;
}
/* endwith(a): 1 when the `len`-byte string `dest` ends with the string literal
   `a`, else 0. Both `dest` and `len` are taken from the expansion site.
   The comparison is made against the TAIL of dest (dest + len - strlen(a));
   the former form compared the start of dest against an address computed past
   the end of the literal. NOTE(review): the trailing ';' is kept only so any
   existing expansion site keeps compiling; it prevents use inside a larger
   expression and should be dropped once call sites are checked. */
#define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest+len-(sizeof(a)-1), a, sizeof(a)-1) == 0 ) : 0 );
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
size_t destsize) {
@@ -6040,11 +5890,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
opt->sizehack = HTS_FALSE;
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = HTS_FALSE;
opt->no_slash_dedup = HTS_FALSE;
opt->no_query_dedup = HTS_FALSE;
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");
@@ -6189,7 +6035,6 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
StringFree(opt->urllist);
StringFree(opt->footer);
StringFree(opt->mod_blacklist);
StringFree(opt->strip_query);
StringFree(opt->path_html);
StringFree(opt->path_html_utf8);

View File

@@ -198,13 +198,6 @@ int url_savename(lien_adrfilsave *const afs,
// copy of fil, used for lookups (see urlhack)
const char *normadr = adr;
const char *normfil = fil_complete;
/* query keys to strip for this URL (NULL = none); decoupled from urlhack */
char BIGSTK stripkeys[HTS_URLMAXSIZE];
const char *const strip =
StringNotEmpty(opt->strip_query)
? hts_query_strip_keys(StringBuff(opt->strip_query), adr,
fil_complete, stripkeys, sizeof(stripkeys))
: NULL;
const char *const print_adr = jump_protocol_const(adr);
const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
@@ -237,13 +230,9 @@ int url_savename(lien_adrfilsave *const afs,
// www-42.foo.com -> foo.com
// foo.com/bar//foobar -> foo.com/bar/foobar
if (opt->urlhack) {
// dedup-lookup key; honor the per-feature negatives like htshash.c so
// distinct URLs keep distinct savenames (else keep normadr = adr)
if (!opt->no_www_dedup)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil =
fil_normalized_filtered_ex(fil_complete, normfil_, strip,
!opt->no_slash_dedup, !opt->no_query_dedup);
// copy of adr (without protocol), used for lookups (see urlhack)
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
normfil = fil_normalized(fil_complete, normfil_);
} else {
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
char *pos = strchr(adr_complete, ':');
@@ -256,11 +245,6 @@ int url_savename(lien_adrfilsave *const afs,
normadr = normadr_;
}
}
// strip still applies with urlhack off (host left untouched); no // or
// query-sort here, to match the hash key (norm_slash/norm_query are 0 when
// urlhack is off) so a URL is looked up under the key it was stored with
if (strip != NULL)
normfil = fil_normalized_filtered_ex(fil_complete, normfil_, strip, 0, 0);
}
// à afficher sans ftp://

View File

@@ -529,12 +529,6 @@ struct httrackp {
htslibhandles libHandles; /**< loaded external module handles */
//
htsoptstate state; /**< embedded live engine state */
String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
appended at the tail to keep field offsets stable */
hts_boolean
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
};
/* Running statistics for a mirror. */

View File

@@ -3602,28 +3602,16 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
// check whether URLHack is harmless or not
if (opt->urlhack) {
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
n_adr[0] = n_fil[0] = '\0';
(void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr));
(void) fil_normalized(moved->fil, n_fil);
(void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr));
(void) fil_normalized(urlfil(), pn_fil);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,

View File

@@ -1052,173 +1052,6 @@ static int st_cookies(httrackp *opt, int argc, char **argv) {
return err;
}
/* --strip-query: resolver + fil_normalized_filtered, end to end.
   Self-test handler: runs a fixed list of assertf() checks against
   hts_query_strip_keys() (which rule applies to a URL) and
   fil_normalized_filtered() (what the path looks like once the keys are
   dropped). opt, argc and argv are unused. On completion prints
   "strip-query self-test OK" and returns 0. */
static int st_stripquery(httrackp *opt, int argc, char **argv) {
/* dest/ref: normalizer outputs; keys: resolver output; k: resolver result */
char dest[1024], keys[256], ref[1024];
const char *k;
(void) opt;
(void) argc;
(void) argv;
/* empty rules == plain fil_normalized */
assertf(hts_query_strip_keys(NULL, "h.com", "/p?a=1", keys, sizeof(keys)) ==
NULL);
assertf(hts_query_strip_keys("", "h.com", "/p?a=1", keys, sizeof(keys)) ==
NULL);
assertf(strcmp(fil_normalized_filtered("/p?b=2&a=1", dest, NULL),
fil_normalized("/p?b=2&a=1", ref)) == 0);
/* bare form (*=keys): strip the key everywhere, keep+sort the rest */
k = hts_query_strip_keys("sid", "any.com", "/p?b=2&sid=x&a=1", keys,
sizeof(keys));
assertf(k != NULL && strcmp(k, "sid") == 0);
assertf(strcmp(fil_normalized_filtered("/p?b=2&sid=x&a=1", dest, k),
"/p?a=1&b=2") == 0);
/* reordered variant + an extra stripped key == the clean URL */
assertf(strcmp(fil_normalized_filtered("/p?sid=y&a=1&b=2", dest, "sid"),
fil_normalized("/p?a=1&b=2", ref)) == 0);
/* host pattern matches only that host, incl. its www-normalized forms */
assertf(hts_query_strip_keys("ex.com/*=utm", "other.com", "/p?utm=1", keys,
sizeof(keys)) == NULL);
assertf(hts_query_strip_keys("ex.com/*=utm", "ex.com", "/p?utm=1", keys,
sizeof(keys)) != NULL);
assertf(hts_query_strip_keys("ex.com/*=utm", "www.ex.com", "/p?utm=1", keys,
sizeof(keys)) != NULL);
assertf(hts_query_strip_keys("ex.com/*=utm", "http://www-3.ex.com",
"/p?utm=1", keys, sizeof(keys)) != NULL);
/* last match wins, wholesale: host rule overrides global, no union */
k = hts_query_strip_keys("*=sid\nex.com/*=utm", "ex.com",
"/p?sid=1&utm=2&a=3", keys, sizeof(keys));
assertf(k != NULL && strcmp(k, "utm") == 0);
assertf(strcmp(fil_normalized_filtered("/p?sid=1&utm=2&a=3", dest, k),
"/p?a=3&sid=1") == 0);
/* a host the second rule does not cover falls back to the global rule */
k = hts_query_strip_keys("*=sid\nex.com/*=utm", "z.com", "/p?sid=1&a=3", keys,
sizeof(keys));
assertf(k != NULL && strcmp(k, "sid") == 0);
/* whole-key match, not prefix: "utm" must not strip utm_source */
assertf(strcmp(fil_normalized_filtered("/p?utm_source=x&a=1", dest, "utm"),
"/p?a=1&utm_source=x") == 0);
/* "*" drops every param; a fully-stripped single-arg query loses its '?' */
assertf(strcmp(fil_normalized_filtered("/p?a=1&b=2", dest, "*"), "/p") == 0);
assertf(strcmp(fil_normalized_filtered("/p?utm=1", dest, "utm"), "/p") == 0);
/* degenerate forms a=, b, c== (key 'c'); strip c keeps a= and b */
assertf(strcmp(fil_normalized_filtered("/p?a=&b&c==", dest, "c"),
"/p?a=&b") == 0);
/* short key must not strip a longer one: 'c' must not touch 'cc' */
assertf(strcmp(fil_normalized_filtered("/p?cc=1&c=2", dest, "c"),
"/p?cc=1") == 0);
/* repeated key: every occurrence is stripped, not just the first */
assertf(
strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "foo"),
"/p?bar=13") == 0);
/* repeated key mixing missing/empty values */
assertf(
strcmp(fil_normalized_filtered("/p?foo&bar=13&foo=42&foo=", dest, "foo"),
"/p?bar=13") == 0);
/* repeated key kept (no match): all occurrences retained, then sorted */
assertf(strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "z"),
"/p?bar=13&foo=42&foo=43") == 0);
/* value containing '=': the key is only the part before the first '='. Strip
'foo' drops "foo=42=17" whole; the '=' in the value is not a delimiter. */
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "foo"),
"/p?bar=") == 0);
/* keeping it preserves the embedded '=' verbatim */
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "bar"),
"/p?foo=42=17") == 0);
/* a value segment is not a key: stripping "42" must not touch foo=42=17 */
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17", dest, "42"),
"/p?foo=42=17") == 0);
/* Idempotency: the read path re-normalizes an already-normalized fil, so the
result must be a fixpoint or dedup misses (catches a dropped empty/trailing
arg like "?&&", "a&"). */
{
/* every query in qs[] is normalized once, then again, under every strips[]
entry; the two results must be identical */
static const char *const qs[] = {"/p?a=&b&c==",
"/p?a&&b",
"/p?&a",
"/p?a&",
"/p?",
"/p?=v",
"/p?&&",
"/p?b=2&a=1",
"/p?utm=x&",
"/p?&utm=x",
"/p?foo=42&bar=13&foo=43",
"/p?foo&bar=13&foo=42&foo=",
"/p?foo=42=17&bar="};
static const char *const strips[] = {NULL, "z", "utm", "*", "a", "foo"};
char once[1024], twice[1024];
size_t i, j;
for (i = 0; i < sizeof(qs) / sizeof(qs[0]); i++) {
for (j = 0; j < sizeof(strips) / sizeof(strips[0]); j++) {
fil_normalized_filtered(qs[i], once, strips[j]);
fil_normalized_filtered(once, twice, strips[j]);
assertf(strcmp(once, twice) == 0);
}
}
}
printf("strip-query self-test OK\n");
return 0;
}
/* -%u url-hack split (#271): each sub-flag must toggle independently.
   Self-test handler: flips opt->urlhack and the three opt->no_*_dedup flags
   and checks, through hash_url_equals(), which URL pairs compare equal under
   each combination. argc and argv are unused. The flags are NOT restored to
   their values on entry: on return urlhack is HTS_FALSE and the three
   no_*_dedup flags are HTS_FALSE. Prints "urlhack self-test OK" and returns
   0. */
static int st_urlhack(httrackp *opt, int argc, char **argv) {
(void) argc;
(void) argv;
/* EQ(host_a, path_a, host_b, path_b): do the two URLs dedupe under opt? */
#define EQ(aa, fa, ab, fb) hash_url_equals(opt, aa, fa, ab, fb)
/* urlhack on, no opt-outs: www, // and query order all collapse */
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
/* keep-www-prefix: host off; // and query still collapse */
opt->no_www_dedup = HTS_TRUE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
opt->no_www_dedup = HTS_FALSE;
/* keep-double-slashes: // significant; www, query order still collapse */
opt->no_slash_dedup = HTS_TRUE;
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
opt->no_slash_dedup = HTS_FALSE;
/* keep-query-order: query order significant; www and // still collapse */
opt->no_query_dedup = HTS_TRUE;
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
opt->no_query_dedup = HTS_FALSE;
/* all opt-outs == urlhack off entirely */
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
/* urlhack itself off, opt-outs cleared: nothing collapses either */
opt->urlhack = HTS_FALSE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
#undef EQ
printf("urlhack self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1235,10 +1068,6 @@ static const struct selftest_entry {
"size-aware filter verdict (negative size = unknown/scan time)",
st_filtersize},
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
{"stripquery", "", "--strip-query pattern/key stripping self-test",
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},

View File

@@ -1,8 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# --strip-query: pattern-scoped query-key stripping for dedup. All assertions
# live in the engine self-test (hts_query_strip_keys + fil_normalized_filtered).
httrack -O /dev/null -#test=stripquery | grep -q "strip-query self-test OK"

View File

@@ -1,8 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# -%u url-hack split (#271): www / // / query-order dedup toggle independently.
# All assertions live in the engine self-test (hash compare flag resolution).
httrack -O /dev/null -#test=urlhack run | grep -q "urlhack self-test OK"

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#
# End-to-end --strip-query (#112): two links to one resource differing only by
# ?utm_source dedup to a single saved file (2 files written: index + resource);
# the control crawl without the option keeps both variants (3 files). Locks the
# CLI->opt->hash plumbing the engine self-test can't reach.
set -e
: "${top_srcdir:=..}"
# stripped: the two ?utm_source variants collapse to one resource
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
httrack 'BASEURL/stripquery/index.html' --strip-query 'utm_source'
# control: no stripping -> both query-named variants are saved
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
httrack 'BASEURL/stripquery/index.html'
# strip still applies with url-hack off (-%u0): exercises the urlhack-off
# savename branch, which must normalize the dedup key the same way the hash does
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
httrack 'BASEURL/stripquery/index.html' -%u0 --strip-query 'utm_source'

View File

@@ -5,7 +5,6 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
proxy-https-server.py \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html \
server-root/stripquery/index.html server-root/stripquery/a.html \
fixtures/cache-golden/hts-cache/new.zip
TESTS_ENVIRONMENT =
@@ -46,9 +45,7 @@ TESTS = \
01_engine-savename.test \
01_engine-selftest-dispatch.test \
01_engine-simplify.test \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \
@@ -71,7 +68,6 @@ TESTS = \
22_local-broken-size.test \
23_local-errpage.test \
24_local-resume-overlap.test \
25_local-mime-exclude.test \
26_local-strip-query.test
25_local-mime-exclude.test
CLEANFILES = check-network_sh.cache

View File

@@ -1 +0,0 @@
<html><body>resource A</body></html>

View File

@@ -1,5 +0,0 @@
<html><body>
Two links to one resource, differing only by a tracking parameter.
<a href="a.html?utm_source=x">x</a>
<a href="a.html?utm_source=y">y</a>
</body></html>