mirror of
https://github.com/xroche/httrack.git
synced 2026-06-27 20:47:19 +03:00
Compare commits
5 Commits
fix/filter
...
fix/urlhac
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8ebfcbe416 | ||
|
|
40a66600ff | ||
|
|
768756e231 | ||
|
|
b138c87a93 | ||
|
|
3de47433b7 |
45
.github/workflows/ci.yml
vendored
45
.github/workflows/ci.yml
vendored
@@ -188,6 +188,51 @@ jobs:
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# MemorySanitizer catches reads of uninitialized memory (#143's stack-garbage
|
||||
# size filter) that ASan/UBSan miss. It flags any byte an uninstrumented lib
|
||||
# wrote, so the job stays in our own code: offline self-tests only, no openssl
|
||||
# (--disable-https), no zlib cache tests, static (the runtime is not in .so's).
|
||||
msan:
|
||||
name: msan (MemorySanitizer, clang)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential clang autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev
|
||||
|
||||
- name: Configure (MSan, static, no https)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure CC=clang \
|
||||
CFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2 -fno-sanitize-recover=all -g -O1 -fno-omit-frame-pointer" \
|
||||
LDFLAGS="-fsanitize=memory" \
|
||||
--disable-https --disable-shared --enable-static
|
||||
|
||||
- name: Build
|
||||
run: make -j"$(nproc)"
|
||||
|
||||
- name: Test (offline self-tests under MSan)
|
||||
env:
|
||||
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||
make check TESTS="$tests"
|
||||
|
||||
- name: Print the test log on failure
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Optional-dependency build: compile and test with HTTPS/OpenSSL disabled --
|
||||
# the configuration users on minimal systems build, and one where libssl is not even
|
||||
# installed here so configure cannot silently re-enable it. The matrix above
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
.\"
|
||||
.\" This file is generated by man/makeman.sh; do not edit by hand.
|
||||
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
||||
.TH httrack 1 "26 June 2026" "httrack website copier"
|
||||
.TH httrack 1 "27 June 2026" "httrack website copier"
|
||||
.SH NAME
|
||||
httrack \- offline browser : copy websites to a local directory
|
||||
.SH SYNOPSIS
|
||||
@@ -43,6 +43,7 @@ httrack \- offline browser : copy websites to a local directory
|
||||
[ \fB\-x, \-\-replace\-external\fR ]
|
||||
[ \fB\-%x, \-\-disable\-passwords\fR ]
|
||||
[ \fB\-%q, \-\-include\-query\-string\fR ]
|
||||
[ \fB\-%g, \-\-strip\-query\fR ]
|
||||
[ \fB\-o, \-\-generate\-errors\fR ]
|
||||
[ \fB\-X, \-\-purge\-old[=N]\fR ]
|
||||
[ \fB\-%p, \-\-preserve\fR ]
|
||||
@@ -198,6 +199,8 @@ replace external html links by error pages (\-\-replace\-external)
|
||||
do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords)
|
||||
.IP \-%q
|
||||
*include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string)
|
||||
.IP \-%g
|
||||
strip query keys for dedup ([host/pattern=]key1,key2,...) (\-\-strip\-query <param>)
|
||||
.IP \-o
|
||||
*generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors)
|
||||
.IP \-X
|
||||
@@ -225,6 +228,8 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\
|
||||
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
|
||||
.IP \-%u
|
||||
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
|
||||
.br
|
||||
opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a)
|
||||
.IP \-%A
|
||||
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
|
||||
.br
|
||||
|
||||
@@ -60,6 +60,9 @@ Please visit our Website: http://www.httrack.com
|
||||
param1 : this option must be alone, and needs one distinct parameter (-P <path>)
|
||||
param0 : this option must be alone, but the parameter should be put together (+*.gif)
|
||||
*/
|
||||
/* clang-format off: hand-aligned table; clang-format reflows the whole
|
||||
initializer (2->4 space) on any edit, churning every untouched row. */
|
||||
/* clang-format off */
|
||||
const char *hts_optalias[][4] = {
|
||||
/* {"","","",""}, */
|
||||
{"path", "-O", "param1", "output path"},
|
||||
@@ -107,6 +110,8 @@ const char *hts_optalias[][4] = {
|
||||
{"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x",
|
||||
"single", ""},
|
||||
{"include-query-string", "-%q", "single", ""},
|
||||
{"strip-query", "-%g", "param1",
|
||||
"strip [host/pattern=]key1,key2,... from URLs"},
|
||||
{"generate-errors", "-o", "single", ""},
|
||||
{"do-not-generate-errors", "-o0", "single", ""},
|
||||
{"purge-old", "-X", "param", ""},
|
||||
@@ -123,6 +128,9 @@ const char *hts_optalias[][4] = {
|
||||
{"tolerant", "-%B", "single", ""},
|
||||
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
|
||||
{"urlhack", "-%u", "single", ""},
|
||||
{"keep-www-prefix", "-%j", "single", ""},
|
||||
{"keep-double-slashes", "-%o", "single", ""},
|
||||
{"keep-query-order", "-%y", "single", ""},
|
||||
{"user-agent", "-F", "param1", "user-agent identity"},
|
||||
{"referer", "-%R", "param1", "default referer URL"},
|
||||
{"from", "-%E", "param1", "from email address"},
|
||||
@@ -241,6 +249,7 @@ const char *hts_optalias[][4] = {
|
||||
|
||||
{"", "", "", ""}
|
||||
};
|
||||
/* clang-format on */
|
||||
|
||||
/*
|
||||
Check for alias in command-line
|
||||
|
||||
@@ -3739,6 +3739,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (StringNotEmpty(from->user_agent))
|
||||
StringCopyS(to->user_agent, from->user_agent);
|
||||
|
||||
if (StringNotEmpty(from->strip_query))
|
||||
StringCopyS(to->strip_query, from->strip_query);
|
||||
|
||||
if (from->retry > -1)
|
||||
to->retry = from->retry;
|
||||
|
||||
|
||||
@@ -234,8 +234,12 @@ struct hash_struct {
|
||||
coucal adrfil;
|
||||
/* former address+path -> link index (renamed/moved entries) */
|
||||
coucal former_adrfil;
|
||||
/* scratch buffers reused across lookups (not reentrant) */
|
||||
int normalized;
|
||||
/* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */
|
||||
int norm_host;
|
||||
int norm_slash;
|
||||
int norm_query;
|
||||
/* query-strip keys (not owned); set from opt->strip_query at hash_init */
|
||||
const char *strip_query;
|
||||
char normfil[HTS_URLMAXSIZE * 2];
|
||||
char normfil2[HTS_URLMAXSIZE * 2];
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -364,6 +368,22 @@ int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
|
||||
char *next_token(char *p, int flag);
|
||||
|
||||
/* Like fil_normalized(), but first drops query keys in STRIP (comma-separated,
|
||||
"*" = all); STRIP NULL/empty behaves exactly like fil_normalized(). */
|
||||
char *fil_normalized_filtered(const char *source, char *dest,
|
||||
const char *strip);
|
||||
|
||||
/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and
|
||||
the query-argument sort independently (the urlhack sub-flags). */
|
||||
char *fil_normalized_filtered_ex(const char *source, char *dest,
|
||||
const char *strip, int do_slash, int do_query);
|
||||
|
||||
/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
|
||||
'\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
|
||||
strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
|
||||
const char *hts_query_strip_keys(const char *rules, const char *adr,
|
||||
const char *fil, char *dest, size_t destsize);
|
||||
|
||||
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
|
||||
owns it and must release it with freet(). Return NULL on missing/unreadable
|
||||
file (readfile_or substitutes defaultdata instead). The byte content is NOT
|
||||
|
||||
@@ -1570,6 +1570,27 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
com++;
|
||||
}
|
||||
break; // url hack
|
||||
case 'j':
|
||||
opt->no_www_dedup = 1; // --keep-www-prefix: keep www.X != X
|
||||
if (*(com + 1) == '0') {
|
||||
opt->no_www_dedup = 0;
|
||||
com++;
|
||||
}
|
||||
break;
|
||||
case 'o':
|
||||
opt->no_slash_dedup = 1; // --keep-double-slashes: keep //
|
||||
if (*(com + 1) == '0') {
|
||||
opt->no_slash_dedup = 0;
|
||||
com++;
|
||||
}
|
||||
break;
|
||||
case 'y':
|
||||
opt->no_query_dedup = 1; // --keep-query-order: keep ?b&a order
|
||||
if (*(com + 1) == '0') {
|
||||
opt->no_query_dedup = 0;
|
||||
com++;
|
||||
}
|
||||
break;
|
||||
case 'v':
|
||||
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
@@ -1937,6 +1958,21 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
break;
|
||||
|
||||
case 'g': // strip-query: accumulate "[pattern=]keys" entries
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
HTS_PANIC_PRINTF("Option strip-query needs a blank space and "
|
||||
"[host/pattern=]key1,key2,...");
|
||||
printf("Example: --strip-query "
|
||||
"\"www.example.com/*=utm_source,sid\"\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
na++;
|
||||
if (StringNotEmpty(opt->strip_query))
|
||||
StringCat(opt->strip_query, "\n");
|
||||
StringCat(opt->strip_query, argv[na]);
|
||||
}
|
||||
break;
|
||||
case 't': /* do not change type (ending) of filenames according to the MIME type */
|
||||
opt->no_type_change = 1;
|
||||
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
|
||||
|
||||
@@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
|
||||
const lien_url*const lien = (const lien_url*) value;
|
||||
const char *const adr = !former ? lien->adr : lien->former_adr;
|
||||
const char *const fil = !former ? lien->fil : lien->former_fil;
|
||||
const char *const adr_norm = adr != NULL ?
|
||||
( hash->normalized ? jump_normalized_const(adr)
|
||||
: jump_identification_const(adr) )
|
||||
: NULL;
|
||||
const char *const adr_norm =
|
||||
adr != NULL ? (hash->norm_host ? jump_normalized_const(adr)
|
||||
: jump_identification_const(adr))
|
||||
: NULL;
|
||||
|
||||
// copy address
|
||||
assertf(adr_norm != NULL);
|
||||
@@ -117,10 +117,18 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
|
||||
|
||||
// copy link
|
||||
assertf(fil != NULL);
|
||||
if (hash->normalized) {
|
||||
fil_normalized(fil, &hash->normfil[strlen(hash->normfil)]);
|
||||
} else {
|
||||
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
|
||||
{
|
||||
/* resolve the per-URL strip keys; strip applies even when urlhack is off */
|
||||
char BIGSTK keybuf[HTS_URLMAXSIZE];
|
||||
const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
|
||||
keybuf, sizeof(keybuf));
|
||||
|
||||
if (hash->norm_slash || hash->norm_query || keys != NULL) {
|
||||
fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)],
|
||||
keys, hash->norm_slash, hash->norm_query);
|
||||
} else {
|
||||
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
|
||||
}
|
||||
}
|
||||
|
||||
// hash
|
||||
@@ -132,8 +140,7 @@ static int key_adrfil_equals_generic(void *arg,
|
||||
coucal_key_const a_,
|
||||
coucal_key_const b_,
|
||||
const int former) {
|
||||
hash_struct *const hash = (hash_struct*) arg;
|
||||
const int normalized = hash->normalized;
|
||||
hash_struct *const hash = (hash_struct *) arg;
|
||||
const lien_url*const a = (const lien_url*) a_;
|
||||
const lien_url*const b = (const lien_url*) b_;
|
||||
const char *const a_adr = !former ? a->adr : a->former_adr;
|
||||
@@ -150,10 +157,10 @@ static int key_adrfil_equals_generic(void *arg,
|
||||
assertf(b_fil != NULL);
|
||||
|
||||
// skip scheme and authentication to the domain (possibly without www.)
|
||||
ja = normalized
|
||||
? jump_normalized_const(a_adr) : jump_identification_const(a_adr);
|
||||
jb = normalized
|
||||
? jump_normalized_const(b_adr) : jump_identification_const(b_adr);
|
||||
ja = hash->norm_host ? jump_normalized_const(a_adr)
|
||||
: jump_identification_const(a_adr);
|
||||
jb = hash->norm_host ? jump_normalized_const(b_adr)
|
||||
: jump_identification_const(b_adr);
|
||||
assertf(ja != NULL);
|
||||
assertf(jb != NULL);
|
||||
if (strcasecmp(ja, jb) != 0) {
|
||||
@@ -161,12 +168,23 @@ static int key_adrfil_equals_generic(void *arg,
|
||||
}
|
||||
|
||||
// now compare paths
|
||||
if (normalized) {
|
||||
fil_normalized(a_fil, hash->normfil);
|
||||
fil_normalized(b_fil, hash->normfil2);
|
||||
return strcmp(hash->normfil, hash->normfil2) == 0;
|
||||
} else {
|
||||
return strcmp(a_fil, b_fil) == 0;
|
||||
{
|
||||
char BIGSTK ka[HTS_URLMAXSIZE], kb[HTS_URLMAXSIZE];
|
||||
const char *const keysa =
|
||||
hts_query_strip_keys(hash->strip_query, a_adr, a_fil, ka, sizeof(ka));
|
||||
const char *const keysb =
|
||||
hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));
|
||||
|
||||
if (hash->norm_slash || hash->norm_query || keysa != NULL ||
|
||||
keysb != NULL) {
|
||||
fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash,
|
||||
hash->norm_query);
|
||||
fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash,
|
||||
hash->norm_query);
|
||||
return strcmp(hash->normfil, hash->normfil2) == 0;
|
||||
} else {
|
||||
return strcmp(a_fil, b_fil) == 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,7 +244,13 @@ void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
|
||||
hash->sav = coucal_new(0);
|
||||
hash->adrfil = coucal_new(0);
|
||||
hash->former_adrfil = coucal_new(0);
|
||||
hash->normalized = normalized;
|
||||
/* urlhack is the umbrella; per-feature negatives opt out of each part */
|
||||
hash->norm_host = normalized && !opt->no_www_dedup;
|
||||
hash->norm_slash = normalized && !opt->no_slash_dedup;
|
||||
hash->norm_query = normalized && !opt->no_query_dedup;
|
||||
/* snapshot the query-strip list (not owned; valid for the hash lifetime) */
|
||||
hash->strip_query =
|
||||
StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
|
||||
|
||||
hts_set_hash_handler(hash->sav, opt);
|
||||
hts_set_hash_handler(hash->adrfil, opt);
|
||||
@@ -282,6 +306,26 @@ void hash_free(hash_struct *hash) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Test helper: do the two URLs dedupe to the same key under opt's urlhack
|
||||
flags? Exercises the live hash compare (norm_host/slash/query resolution). */
|
||||
int hash_url_equals(httrackp *opt, const char *adra, const char *fila,
|
||||
const char *adrb, const char *filb) {
|
||||
hash_struct hash;
|
||||
lien_url la, lb;
|
||||
int eq;
|
||||
|
||||
memset(&la, 0, sizeof(la));
|
||||
memset(&lb, 0, sizeof(lb));
|
||||
la.adr = key_duphandler(NULL, adra);
|
||||
la.fil = key_duphandler(NULL, fila);
|
||||
lb.adr = key_duphandler(NULL, adrb);
|
||||
lb.fil = key_duphandler(NULL, filb);
|
||||
hash_init(opt, &hash, opt->urlhack);
|
||||
eq = key_adrfil_equals(&hash, &la, &lb);
|
||||
hash_free(&hash);
|
||||
return eq;
|
||||
}
|
||||
|
||||
// retour: position ou -1 si non trouvé
|
||||
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
||||
hash_struct_type type) {
|
||||
|
||||
@@ -53,6 +53,9 @@ typedef enum hash_struct_type {
|
||||
// tables de hachage
|
||||
void hash_init(httrackp *opt, hash_struct *hash, int normalized);
|
||||
void hash_free(hash_struct *hash);
|
||||
/* Test helper: 1 if the two URLs dedupe together under opt's urlhack flags. */
|
||||
int hash_url_equals(httrackp *opt, const char *adra, const char *fila,
|
||||
const char *adrb, const char *filb);
|
||||
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
||||
hash_struct_type type);
|
||||
void hash_write(hash_struct * hash, size_t lpos);
|
||||
|
||||
@@ -563,6 +563,7 @@ void help(const char *app, int more) {
|
||||
(" %x do not include any password for external password protected websites (%x0 include)");
|
||||
infomsg
|
||||
(" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)");
|
||||
infomsg(" %g strip query keys for dedup ([host/pattern=]key1,key2,...)");
|
||||
infomsg
|
||||
(" o *generate output html file in case of error (404..) (o0 don't generate)");
|
||||
infomsg(" X *purge old files after update (X0 keep delete)");
|
||||
@@ -587,6 +588,9 @@ void help(const char *app, int more) {
|
||||
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
|
||||
infomsg
|
||||
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
|
||||
infomsg(" opt out of one url-hack part: --keep-www-prefix "
|
||||
"(www.foo.com<>foo.com), --keep-double-slashes (//), "
|
||||
"--keep-query-order (?b&a)");
|
||||
infomsg
|
||||
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
|
||||
infomsg(" shortcut: '--assume standard' is equivalent to -%A "
|
||||
|
||||
163
src/htslib.c
163
src/htslib.c
@@ -3610,7 +3610,10 @@ static int sortNormFnc(const void *a_, const void *b_) {
|
||||
return strcmp(*a + 1, *b + 1);
|
||||
}
|
||||
|
||||
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
||||
/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or
|
||||
sort query arguments (DO_QUERY) so equivalent URLs dedupe. */
|
||||
static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
|
||||
int do_query) {
|
||||
char lastc = 0;
|
||||
int gotquery = 0;
|
||||
int ampargs = 0;
|
||||
@@ -3620,8 +3623,8 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
||||
for(i = j = 0; source[i] != '\0'; i++) {
|
||||
if (!gotquery && source[i] == '?')
|
||||
gotquery = ampargs = 1;
|
||||
if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar
|
||||
) {
|
||||
if (do_slash && !gotquery && lastc == '/' && source[i] == '/') {
|
||||
// foo//bar -> foo/bar
|
||||
} else {
|
||||
if (gotquery && source[i] == '&') {
|
||||
ampargs++;
|
||||
@@ -3633,7 +3636,7 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
||||
dest[j++] = '\0';
|
||||
|
||||
/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
|
||||
if (ampargs > 1) {
|
||||
if (do_query && ampargs > 1) {
|
||||
char **amps = malloct(ampargs * sizeof(char *));
|
||||
char *copyBuff = NULL;
|
||||
size_t qLen = 0;
|
||||
@@ -3681,6 +3684,153 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
||||
return dest;
|
||||
}
|
||||
|
||||
/* Normalize the path SOURCE into DEST with both rewrites of the core
   normalizer enabled (redundant '//' collapse and query-argument sort).
   Returns whatever fil_normalized_ex() returns. */
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
  enum { COLLAPSE_SLASHES = 1, SORT_QUERY_ARGS = 1 };

  return fil_normalized_ex(source, dest, COLLAPSE_SLASHES, SORT_QUERY_ARGS);
}
|
||||
|
||||
/* Tell whether the query key ARG[0..keylen) appears in the comma-separated
   STRIP list.

   - Tokens are compared case-sensitively after trimming leading/trailing
     spaces.
   - A token consisting of the single character '*' matches every key.
   - Empty tokens (as produced by "a,,b", ",a" or a blank entry) are ignored:
     they never match, so stray punctuation in the list cannot silently strip
     empty or key-less query fields.
   - A NULL or empty STRIP list matches nothing.

   Returns 1 if the key must be stripped, 0 otherwise. */
static int hts_query_key_stripped(const char *arg, size_t keylen,
                                  const char *strip) {
  const char *p = strip;

  if (p == NULL)
    return 0;

  while (*p != '\0') {
    const char *tok = p;
    size_t toklen;

    /* Delimit the current token, then step over its ',' separator. */
    while (*p != '\0' && *p != ',')
      p++;
    toklen = (size_t) (p - tok);
    if (*p == ',')
      p++;

    /* Trim surrounding spaces. */
    while (toklen > 0 && *tok == ' ') {
      tok++;
      toklen--;
    }
    while (toklen > 0 && tok[toklen - 1] == ' ')
      toklen--;

    /* An empty token names no key: skip it instead of matching "". */
    if (toklen == 0)
      continue;

    if (toklen == 1 && tok[0] == '*')
      return 1;
    if (toklen == keylen && strncmp(tok, arg, keylen) == 0)
      return 1;
  }
  return 0;
}
|
||||
|
||||
/* see htscore.h */
|
||||
char *fil_normalized_filtered_ex(const char *source, char *dest,
|
||||
const char *strip, int do_slash,
|
||||
int do_query) {
|
||||
const char *query;
|
||||
char BIGSTK tmp[HTS_URLMAXSIZE * 2];
|
||||
htsbuff cb;
|
||||
int wrote = 0;
|
||||
|
||||
/* No strip list, or no query: plain normalization. */
|
||||
if (strip == NULL || *strip == '\0' ||
|
||||
(query = strchr(source, '?')) == NULL) {
|
||||
return fil_normalized_ex(source, dest, do_slash, do_query);
|
||||
}
|
||||
|
||||
/* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
|
||||
every field incl. empty/trailing ("a&","?&&") so the result is a fixpoint
|
||||
(the read re-normalizes it; a dropped empty arg would miss dedup). */
|
||||
cb = htsbuff_ptr(tmp, sizeof(tmp));
|
||||
htsbuff_catn(&cb, source, (size_t) (query - source));
|
||||
for (query++;;) {
|
||||
const char *const arg = query;
|
||||
const char *eq = NULL;
|
||||
size_t keylen, arglen;
|
||||
|
||||
while (*query != '\0' && *query != '&') {
|
||||
if (eq == NULL && *query == '=')
|
||||
eq = query;
|
||||
query++;
|
||||
}
|
||||
arglen = (size_t) (query - arg);
|
||||
keylen = eq != NULL ? (size_t) (eq - arg) : arglen;
|
||||
if (!hts_query_key_stripped(arg, keylen, strip)) {
|
||||
htsbuff_catc(&cb, wrote ? '&' : '?');
|
||||
htsbuff_catn(&cb, arg, arglen);
|
||||
wrote = 1;
|
||||
}
|
||||
if (*query == '\0')
|
||||
break;
|
||||
query++;
|
||||
}
|
||||
return fil_normalized_ex(tmp, dest, do_slash, do_query);
|
||||
}
|
||||
|
||||
/* see htscore.h */
char *fil_normalized_filtered(const char *source, char *dest,
                              const char *strip) {
  /* Same as the _ex variant with both the '//' collapse and the
     query-argument sort switched on. */
  enum { COLLAPSE_SLASHES = 1, SORT_QUERY_ARGS = 1 };

  return fil_normalized_filtered_ex(source, dest, strip, COLLAPSE_SLASHES,
                                    SORT_QUERY_ARGS);
}
|
||||
|
||||
/* see htscore.h */
|
||||
const char *hts_query_strip_keys(const char *rules, const char *adr,
|
||||
const char *fil, char *dest, size_t destsize) {
|
||||
const char *p, *q;
|
||||
const char *result = NULL;
|
||||
char BIGSTK url[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (rules == NULL || *rules == '\0' || destsize == 0)
|
||||
return NULL;
|
||||
|
||||
/* Match string = normalized host/path, query removed. jump_normalized_const
|
||||
collapses www+scheme/auth so read and write (double-normalized) agree;
|
||||
query excluded keeps the decision on host/path only. */
|
||||
url[0] = '\0';
|
||||
strcatbuff(url, jump_normalized_const(adr));
|
||||
if (fil[0] != '/')
|
||||
strcatbuff(url, "/");
|
||||
q = strchr(fil, '?');
|
||||
if (q != NULL)
|
||||
strncatbuff(url, fil, (int) (q - fil));
|
||||
else
|
||||
strcatbuff(url, fil);
|
||||
|
||||
/* Walk the '\n' entries; last match wins (like the +/- filter eval). Each is
|
||||
"pattern=keys"; no '=' is the bare form, pattern "*". */
|
||||
for (p = rules; *p != '\0';) {
|
||||
const char *const line = p;
|
||||
const char *eol, *eq, *keys;
|
||||
char BIGSTK pat[HTS_URLMAXSIZE * 2];
|
||||
|
||||
while (*p != '\0' && *p != '\n')
|
||||
p++;
|
||||
eol = p;
|
||||
if (*p == '\n')
|
||||
p++;
|
||||
if (eol == line)
|
||||
continue;
|
||||
eq = memchr(line, '=', (size_t) (eol - line));
|
||||
if (eq != NULL) {
|
||||
size_t patlen = (size_t) (eq - line);
|
||||
|
||||
if (patlen >= sizeof(pat))
|
||||
patlen = sizeof(pat) - 1;
|
||||
memcpy(pat, line, patlen);
|
||||
pat[patlen] = '\0';
|
||||
keys = eq + 1;
|
||||
} else {
|
||||
pat[0] = '*';
|
||||
pat[1] = '\0';
|
||||
keys = line;
|
||||
}
|
||||
if (strjoker(url, pat, NULL, NULL) != NULL) {
|
||||
size_t klen = (size_t) (eol - keys);
|
||||
|
||||
if (klen >= destsize)
|
||||
klen = destsize - 1;
|
||||
memcpy(dest, keys, klen);
|
||||
dest[klen] = '\0';
|
||||
result = dest;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 );
|
||||
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
|
||||
size_t destsize) {
|
||||
@@ -5890,7 +6040,11 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
|
||||
opt->sizehack = HTS_FALSE;
|
||||
opt->urlhack = HTS_TRUE;
|
||||
opt->no_www_dedup = HTS_FALSE;
|
||||
opt->no_slash_dedup = HTS_FALSE;
|
||||
opt->no_query_dedup = HTS_FALSE;
|
||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||
StringCopy(opt->strip_query, "");
|
||||
opt->ftp_proxy = HTS_TRUE;
|
||||
opt->convert_utf8 = HTS_TRUE;
|
||||
StringCopy(opt->filelist, "");
|
||||
@@ -6035,6 +6189,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
||||
StringFree(opt->urllist);
|
||||
StringFree(opt->footer);
|
||||
StringFree(opt->mod_blacklist);
|
||||
StringFree(opt->strip_query);
|
||||
|
||||
StringFree(opt->path_html);
|
||||
StringFree(opt->path_html_utf8);
|
||||
|
||||
@@ -198,6 +198,13 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// copy of fil, used for lookups (see urlhack)
|
||||
const char *normadr = adr;
|
||||
const char *normfil = fil_complete;
|
||||
/* query keys to strip for this URL (NULL = none); decoupled from urlhack */
|
||||
char BIGSTK stripkeys[HTS_URLMAXSIZE];
|
||||
const char *const strip =
|
||||
StringNotEmpty(opt->strip_query)
|
||||
? hts_query_strip_keys(StringBuff(opt->strip_query), adr,
|
||||
fil_complete, stripkeys, sizeof(stripkeys))
|
||||
: NULL;
|
||||
const char *const print_adr = jump_protocol_const(adr);
|
||||
const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
|
||||
|
||||
@@ -230,9 +237,13 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// www-42.foo.com -> foo.com
|
||||
// foo.com/bar//foobar -> foo.com/bar/foobar
|
||||
if (opt->urlhack) {
|
||||
// copy of adr (without protocol), used for lookups (see urlhack)
|
||||
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
|
||||
normfil = fil_normalized(fil_complete, normfil_);
|
||||
// dedup-lookup key; honor the per-feature negatives like htshash.c so
|
||||
// distinct URLs keep distinct savenames (else keep normadr = adr)
|
||||
if (!opt->no_www_dedup)
|
||||
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
|
||||
normfil =
|
||||
fil_normalized_filtered_ex(fil_complete, normfil_, strip,
|
||||
!opt->no_slash_dedup, !opt->no_query_dedup);
|
||||
} else {
|
||||
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
|
||||
char *pos = strchr(adr_complete, ':');
|
||||
@@ -245,6 +256,9 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
normadr = normadr_;
|
||||
}
|
||||
}
|
||||
// strip still applies with urlhack off (host left untouched)
|
||||
if (strip != NULL)
|
||||
normfil = fil_normalized_filtered(fil_complete, normfil_, strip);
|
||||
}
|
||||
|
||||
// à afficher sans ftp://
|
||||
|
||||
@@ -529,6 +529,12 @@ struct httrackp {
|
||||
htslibhandles libHandles; /**< loaded external module handles */
|
||||
//
|
||||
htsoptstate state; /**< embedded live engine state */
|
||||
String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
|
||||
appended at the tail to keep field offsets stable */
|
||||
hts_boolean
|
||||
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
|
||||
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
|
||||
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
|
||||
};
|
||||
|
||||
/* Running statistics for a mirror. */
|
||||
|
||||
@@ -3602,16 +3602,28 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not
|
||||
if (opt->urlhack) {
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
n_adr[0] = n_fil[0] = '\0';
|
||||
(void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr));
|
||||
(void) fil_normalized(moved->fil, n_fil);
|
||||
(void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr));
|
||||
(void) fil_normalized(urlfil(), pn_fil);
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
|
||||
@@ -537,7 +537,9 @@ static int st_filtersize(httrackp *opt, int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
|
||||
sz = known ? (LLint) strtoll(argv[0], NULL, 10) : -1;
|
||||
sz = -1;
|
||||
if (known)
|
||||
sscanf(argv[0], LLintP, &sz);
|
||||
verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
|
||||
known ? &size_flag : NULL, NULL);
|
||||
printf("verdict=%s size_flag=%d\n",
|
||||
@@ -1050,6 +1052,173 @@ static int st_cookies(httrackp *opt, int argc, char **argv) {
|
||||
return err;
|
||||
}
|
||||
|
||||
/* --strip-query: resolver + fil_normalized_filtered, end to end. */
|
||||
static int st_stripquery(httrackp *opt, int argc, char **argv) {
|
||||
char dest[1024], keys[256], ref[1024];
|
||||
const char *k;
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
|
||||
/* empty rules == plain fil_normalized */
|
||||
assertf(hts_query_strip_keys(NULL, "h.com", "/p?a=1", keys, sizeof(keys)) ==
|
||||
NULL);
|
||||
assertf(hts_query_strip_keys("", "h.com", "/p?a=1", keys, sizeof(keys)) ==
|
||||
NULL);
|
||||
assertf(strcmp(fil_normalized_filtered("/p?b=2&a=1", dest, NULL),
|
||||
fil_normalized("/p?b=2&a=1", ref)) == 0);
|
||||
|
||||
/* bare form (*=keys): strip the key everywhere, keep+sort the rest */
|
||||
k = hts_query_strip_keys("sid", "any.com", "/p?b=2&sid=x&a=1", keys,
|
||||
sizeof(keys));
|
||||
assertf(k != NULL && strcmp(k, "sid") == 0);
|
||||
assertf(strcmp(fil_normalized_filtered("/p?b=2&sid=x&a=1", dest, k),
|
||||
"/p?a=1&b=2") == 0);
|
||||
|
||||
/* reordered variant + an extra stripped key == the clean URL */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?sid=y&a=1&b=2", dest, "sid"),
|
||||
fil_normalized("/p?a=1&b=2", ref)) == 0);
|
||||
|
||||
/* host pattern matches only that host, incl. its www-normalized forms */
|
||||
assertf(hts_query_strip_keys("ex.com/*=utm", "other.com", "/p?utm=1", keys,
|
||||
sizeof(keys)) == NULL);
|
||||
assertf(hts_query_strip_keys("ex.com/*=utm", "ex.com", "/p?utm=1", keys,
|
||||
sizeof(keys)) != NULL);
|
||||
assertf(hts_query_strip_keys("ex.com/*=utm", "www.ex.com", "/p?utm=1", keys,
|
||||
sizeof(keys)) != NULL);
|
||||
assertf(hts_query_strip_keys("ex.com/*=utm", "http://www-3.ex.com",
|
||||
"/p?utm=1", keys, sizeof(keys)) != NULL);
|
||||
|
||||
/* last match wins, wholesale: host rule overrides global, no union */
|
||||
k = hts_query_strip_keys("*=sid\nex.com/*=utm", "ex.com",
|
||||
"/p?sid=1&utm=2&a=3", keys, sizeof(keys));
|
||||
assertf(k != NULL && strcmp(k, "utm") == 0);
|
||||
assertf(strcmp(fil_normalized_filtered("/p?sid=1&utm=2&a=3", dest, k),
|
||||
"/p?a=3&sid=1") == 0);
|
||||
k = hts_query_strip_keys("*=sid\nex.com/*=utm", "z.com", "/p?sid=1&a=3", keys,
|
||||
sizeof(keys));
|
||||
assertf(k != NULL && strcmp(k, "sid") == 0);
|
||||
|
||||
/* whole-key match, not prefix: "utm" must not strip utm_source */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?utm_source=x&a=1", dest, "utm"),
|
||||
"/p?a=1&utm_source=x") == 0);
|
||||
|
||||
/* "*" drops every param; a fully-stripped single-arg query loses its '?' */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?a=1&b=2", dest, "*"), "/p") == 0);
|
||||
assertf(strcmp(fil_normalized_filtered("/p?utm=1", dest, "utm"), "/p") == 0);
|
||||
|
||||
/* degenerate forms a=, b, c== (key 'c'); strip c keeps a= and b */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?a=&b&c==", dest, "c"),
|
||||
"/p?a=&b") == 0);
|
||||
/* short key must not strip a longer one: 'c' must not touch 'cc' */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?cc=1&c=2", dest, "c"),
|
||||
"/p?cc=1") == 0);
|
||||
|
||||
/* repeated key: every occurrence is stripped, not just the first */
|
||||
assertf(
|
||||
strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "foo"),
|
||||
"/p?bar=13") == 0);
|
||||
/* repeated key mixing missing/empty values */
|
||||
assertf(
|
||||
strcmp(fil_normalized_filtered("/p?foo&bar=13&foo=42&foo=", dest, "foo"),
|
||||
"/p?bar=13") == 0);
|
||||
/* repeated key kept (no match): all occurrences retained, then sorted */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?foo=42&bar=13&foo=43", dest, "z"),
|
||||
"/p?bar=13&foo=42&foo=43") == 0);
|
||||
|
||||
/* value containing '=': the key is only the part before the first '='. Strip
|
||||
'foo' drops "foo=42=17" whole; the '=' in the value is not a delimiter. */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "foo"),
|
||||
"/p?bar=") == 0);
|
||||
/* keeping it preserves the embedded '=' verbatim */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17&bar=", dest, "bar"),
|
||||
"/p?foo=42=17") == 0);
|
||||
/* a value segment is not a key: stripping "42" must not touch foo=42=17 */
|
||||
assertf(strcmp(fil_normalized_filtered("/p?foo=42=17", dest, "42"),
|
||||
"/p?foo=42=17") == 0);
|
||||
|
||||
/* Idempotency: the read path re-normalizes an already-normalized fil, so the
|
||||
result must be a fixpoint or dedup misses (catches a dropped empty/trailing
|
||||
arg like "?&&", "a&"). */
|
||||
{
|
||||
static const char *const qs[] = {"/p?a=&b&c==",
|
||||
"/p?a&&b",
|
||||
"/p?&a",
|
||||
"/p?a&",
|
||||
"/p?",
|
||||
"/p?=v",
|
||||
"/p?&&",
|
||||
"/p?b=2&a=1",
|
||||
"/p?utm=x&",
|
||||
"/p?&utm=x",
|
||||
"/p?foo=42&bar=13&foo=43",
|
||||
"/p?foo&bar=13&foo=42&foo=",
|
||||
"/p?foo=42=17&bar="};
|
||||
static const char *const strips[] = {NULL, "z", "utm", "*", "a", "foo"};
|
||||
char once[1024], twice[1024];
|
||||
size_t i, j;
|
||||
|
||||
for (i = 0; i < sizeof(qs) / sizeof(qs[0]); i++) {
|
||||
for (j = 0; j < sizeof(strips) / sizeof(strips[0]); j++) {
|
||||
fil_normalized_filtered(qs[i], once, strips[j]);
|
||||
fil_normalized_filtered(once, twice, strips[j]);
|
||||
assertf(strcmp(once, twice) == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("strip-query self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* -%u url-hack split (#271): each sub-flag must toggle independently. */
|
||||
static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#define EQ(aa, fa, ab, fb) hash_url_equals(opt, aa, fa, ab, fb)
|
||||
/* urlhack on, no opt-outs: www, // and query order all collapse */
|
||||
opt->urlhack = 1;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = 0;
|
||||
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
|
||||
|
||||
/* keep-www-prefix: host off; // and query still collapse */
|
||||
opt->no_www_dedup = 1;
|
||||
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
|
||||
opt->no_www_dedup = 0;
|
||||
|
||||
/* keep-double-slashes: // significant; www, query order still collapse */
|
||||
opt->no_slash_dedup = 1;
|
||||
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
|
||||
opt->no_slash_dedup = 0;
|
||||
|
||||
/* keep-query-order: query order significant; www and // still collapse */
|
||||
opt->no_query_dedup = 1;
|
||||
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
|
||||
assertf(EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
opt->no_query_dedup = 0;
|
||||
|
||||
/* all opt-outs == urlhack off entirely */
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = 1;
|
||||
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
assertf(!EQ("foo.com", "/p?b=2&a=1", "foo.com", "/p?a=1&b=2"));
|
||||
opt->urlhack = 0;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = 0;
|
||||
assertf(!EQ("www.foo.com", "/a", "foo.com", "/a"));
|
||||
assertf(!EQ("foo.com", "/a//b", "foo.com", "/a/b"));
|
||||
#undef EQ
|
||||
printf("urlhack self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1066,6 +1235,10 @@ static const struct selftest_entry {
|
||||
"size-aware filter verdict (negative size = unknown/scan time)",
|
||||
st_filtersize},
|
||||
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
|
||||
{"stripquery", "", "--strip-query pattern/key stripping self-test",
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
|
||||
@@ -84,6 +84,9 @@ fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # sc
|
||||
fsize 'verdict=forbidden size_flag=1' 5 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # <10KB: cancel
|
||||
fsize 'verdict=allowed size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # >=10KB: keep
|
||||
fsize 'verdict=forbidden size_flag=0' -1 foo.txt -* '+*.jpg' '-*.jpg*[<10]' # not a jpg
|
||||
# the '>' operator is just as neutral at scan time, and fires once size is known
|
||||
fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # scan time: keep
|
||||
fsize 'verdict=forbidden size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
|
||||
|
||||
# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
|
||||
# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
|
||||
|
||||
8
tests/01_engine-stripquery.test
Executable file
8
tests/01_engine-stripquery.test
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# --strip-query: pattern-scoped query-key stripping for dedup. The assertions
# themselves are in the engine self-test (hts_query_strip_keys +
# fil_normalized_filtered); this script only looks for its success marker.
readonly marker="strip-query self-test OK"

httrack -O /dev/null -#test=stripquery | grep -q "$marker"
|
||||
8
tests/01_engine-urlhack.test
Normal file
8
tests/01_engine-urlhack.test
Normal file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# -%u url-hack split (#271): the www, // and query-order dedup switches must
# toggle independently. The assertions themselves are in the engine self-test
# (hash compare flag resolution); this script only looks for its success marker.
readonly marker="urlhack self-test OK"

httrack -O /dev/null -#test=urlhack run | grep -q "$marker"
|
||||
18
tests/26_local-strip-query.test
Executable file
18
tests/26_local-strip-query.test
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
#
# End-to-end --strip-query (#112): two links to one resource that differ only
# by ?utm_source dedup to a single saved file (2 files written: index +
# resource); the control crawl without the option keeps both variants
# (3 files). Locks the CLI->opt->hash plumbing the engine self-test can't reach.

set -e

top_srcdir="${top_srcdir:-..}"

# crawl <expected file count> [extra httrack options...]
# Runs the shared local-crawl harness on the stripquery fixture, requiring
# zero errors and exactly the given number of written files.
crawl() {
  local expected_files="$1"
  shift
  bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files "$expected_files" \
    httrack 'BASEURL/stripquery/index.html' "$@"
}

# stripped: the two ?utm_source variants collapse to one resource
crawl 2 --strip-query 'utm_source'

# control: no stripping -> both query-named variants are saved
crawl 3
|
||||
@@ -5,6 +5,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
proxy-https-server.py \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -45,7 +46,9 @@ TESTS = \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
@@ -68,6 +71,7 @@ TESTS = \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test \
|
||||
24_local-resume-overlap.test \
|
||||
25_local-mime-exclude.test
|
||||
25_local-mime-exclude.test \
|
||||
26_local-strip-query.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
1
tests/server-root/stripquery/a.html
Normal file
1
tests/server-root/stripquery/a.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body>resource A</body></html>
|
||||
5
tests/server-root/stripquery/index.html
Normal file
5
tests/server-root/stripquery/index.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<html><body>
|
||||
Two links to one resource, differing only by a tracking parameter.
|
||||
<a href="a.html?utm_source=x">x</a>
|
||||
<a href="a.html?utm_source=y">y</a>
|
||||
</body></html>
|
||||
Reference in New Issue
Block a user