mirror of
https://github.com/xroche/httrack.git
synced 2026-06-27 04:27:16 +03:00
Compare commits
4 Commits
fix/mime-e
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b138c87a93 | ||
|
|
3de47433b7 | ||
|
|
fb8827718e | ||
|
|
7228210061 |
@@ -736,26 +736,39 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/* OPTIMIZED for fast load */
|
||||
if (StringNotEmpty(opt->filelist)) {
|
||||
char *filelist_buff = NULL;
|
||||
const size_t filelist_sz = off_t_to_size_t(fsize(StringBuff(opt->filelist)));
|
||||
size_t filelist_sz = 0;
|
||||
const char *filelist_err = NULL; /* failure reason, NULL on success */
|
||||
const off_t fs = fsize(StringBuff(opt->filelist));
|
||||
|
||||
if (filelist_sz != (size_t) -1) {
|
||||
if (fs < 0) {
|
||||
/* fsize() hides the cause; redo stat() for a precise errno (#49) */
|
||||
struct stat st;
|
||||
filelist_err = stat(StringBuff(opt->filelist), &st) != 0
|
||||
? strerror(errno)
|
||||
: "not a regular file";
|
||||
} else if ((filelist_sz = off_t_to_size_t(fs)) == (size_t) -1) {
|
||||
filelist_err = "file too large";
|
||||
filelist_sz = 0;
|
||||
} else {
|
||||
FILE *fp = fopen(StringBuff(opt->filelist), "rb");
|
||||
|
||||
if (fp) {
|
||||
if (fp == NULL) {
|
||||
filelist_err = strerror(errno);
|
||||
} else {
|
||||
filelist_buff = malloct(filelist_sz + 1);
|
||||
if (filelist_buff) {
|
||||
if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
|
||||
freet(filelist_buff);
|
||||
filelist_buff = NULL;
|
||||
} else {
|
||||
*(filelist_buff + filelist_sz) = '\0';
|
||||
}
|
||||
if (filelist_buff == NULL) {
|
||||
filelist_err = "out of memory";
|
||||
} else if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
|
||||
freet(filelist_buff);
|
||||
filelist_err = "read error";
|
||||
} else {
|
||||
filelist_buff[filelist_sz] = '\0';
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
if (filelist_buff) {
|
||||
if (filelist_buff != NULL) {
|
||||
int filelist_ptr = 0;
|
||||
int n = 0;
|
||||
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
||||
@@ -780,8 +793,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
// Free buffer
|
||||
freet(filelist_buff);
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR, "Could not include URL list: %s",
|
||||
StringBuff(opt->filelist));
|
||||
hts_log_print(opt, LOG_ERROR, "Could not include URL list \"%s\": %s",
|
||||
StringBuff(opt->filelist), filelist_err);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
|
||||
}
|
||||
if (size)
|
||||
sz = *size;
|
||||
if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) { // reconnu
|
||||
/* size unknown (scan time): no size pointer => size tests stay neutral */
|
||||
if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
|
||||
if (size)
|
||||
if (sz != *size)
|
||||
sizelimit = sz;
|
||||
|
||||
@@ -524,6 +524,32 @@ static int st_filter(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Size-aware filter verdict via fa_strjoker: a negative <size> means the size
|
||||
is still unknown (scan time), so a size rule like -*.jpg*[<10] must stay
|
||||
neutral. */
|
||||
static int st_filtersize(httrackp *opt, int argc, char **argv) {
|
||||
LLint sz;
|
||||
int size_flag = 0, verdict, known;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "filtersize: needs <size> <string> <filter> [filter...]\n");
|
||||
return 1;
|
||||
}
|
||||
known = (argv[0][0] != '-'); /* "-1"/"-" => size unknown */
|
||||
sz = -1;
|
||||
if (known)
|
||||
sscanf(argv[0], LLintP, &sz);
|
||||
verdict = fa_strjoker(0, &argv[2], argc - 2, argv[1], known ? &sz : NULL,
|
||||
known ? &size_flag : NULL, NULL);
|
||||
printf("verdict=%s size_flag=%d\n",
|
||||
verdict > 0 ? "allowed"
|
||||
: verdict < 0 ? "forbidden"
|
||||
: "unknown",
|
||||
size_flag);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_simplify(httrackp *opt, int argc, char **argv) {
|
||||
(void) opt;
|
||||
if (argc < 1) {
|
||||
@@ -1038,6 +1064,9 @@ static const struct selftest_entry {
|
||||
} selftests[] = {
|
||||
{"filter", "<pattern> <string>", "match a string against a wildcard filter",
|
||||
st_filter},
|
||||
{"filtersize", "<size> <string> <filter>...",
|
||||
"size-aware filter verdict (negative size = unknown/scan time)",
|
||||
st_filtersize},
|
||||
{"simplify", "<path>", "collapse ./ and ../ in a path", st_simplify},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
|
||||
65
tests/01_engine-filelist.test
Normal file
65
tests/01_engine-filelist.test
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -%L URL-list loading (#49): a readable list is honored; an unusable one fails
|
||||
# with the reason (errno / not-a-regular-file), not a bare "Could not include
|
||||
# URL list". Offline: file:// fixture, no server. Asserts on httrack's own
|
||||
# strings and the message shape, so it is locale-independent.
|
||||
|
||||
# Strict mode: abort on errors, on unset variables and on failing pipelines.
set -e -u -o pipefail

# Private scratch directory, removed on exit and on the usual fatal signals.
tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_filelist.XXXXXX") || exit 1
trap 'rm -rf "$tmp"' EXIT HUP INT QUIT PIPE TERM

# Fixture page that the readable URL list points at.
printf '%s\n' '<html><body>hi</body></html>' >"$tmp/index.html"
|
||||
|
||||
# run <outdir> <list>: mirror into a fresh <outdir> with <list> as the -%L
# target. httrack's exit status is deliberately ignored and its console output
# is kept in <outdir>/.stdout; the global LOG is then pointed at
# <outdir>/hts-log.txt.
run() {
  local outdir="$1"
  local urllist="$2"

  rm -rf "$outdir"
  mkdir -p "$outdir"
  if ! httrack -O "$outdir" --quiet -n "-%L" "$urllist" >"$outdir/.stdout" 2>&1; then
    : # a failing mirror is tolerated: the assertions look at the log only
  fi
  LOG="$outdir/hts-log.txt"
}
|
||||
|
||||
# fail <message>: report the failed expectation, dump the log referenced by
# LOG for diagnosis, and abort the test with status 1.
fail() {
  echo "FAIL: $1"
  # The log may be missing (or LOG not set yet); under "set -eu" an unguarded
  # cat "$LOG" would then abort right here with its own error instead of
  # reaching the explicit exit below.
  if test -n "${LOG:-}" && test -f "$LOG"; then
    cat "$LOG"
  else
    echo "(no log file: ${LOG:-unset})"
  fi
  exit 1
}
|
||||
# loghas <ere>: the file named by LOG must contain a line matching the
# extended regular expression <ere>; otherwise the test fails.
loghas() {
  local pattern="$1"

  if ! grep -Eq "$pattern" "$LOG"; then
    fail "expected /$pattern/ in $LOG"
  fi
}
|
||||
# lognot <ere>: the test fails if the file named by LOG contains a line
# matching the extended regular expression <ere>.
lognot() {
  local pattern="$1"

  ! grep -Eq "$pattern" "$LOG" || fail "unexpected /$pattern/ in $LOG"
}
|
||||
|
||||
# Case 1 - readable list: its single URL must be loaded and counted, so the
# reported number of added links has to be non-zero.
urls="$tmp/urls.txt"
printf 'file://%s/index.html\n' "$tmp" >"$urls"
run "$tmp/ok" "$urls"
loghas '[1-9][0-9]* links added from'

# Case 2 - missing file: the message carries the quoted name and a non-empty
# reason, never the old reasonless "Could not include URL list: <name>". The
# reason must be the stat() errno text, not the "not a regular file" fallback
# literal (guards against dropping the errno lookup).
missing="$tmp/nope.txt"
run "$tmp/miss" "$missing"
loghas 'Could not include URL list "[^"]+": .+'
lognot 'Could not include URL list: '
lognot 'not a regular file'

# Case 3 - directory: rejected with httrack's own, locale-independent reason.
listdir="$tmp/adir"
mkdir -p "$listdir"
run "$tmp/dir" "$listdir"
loghas 'Could not include URL list "[^"]+": not a regular file'
|
||||
|
||||
# unreadable regular file: the fopen() errno arm fires, distinct from the
# directory branch. Mode 000 is not enforced everywhere (root, and possibly
# other privilege or filesystem setups): there the open would succeed and the
# expected error would never be logged, so the case only runs when the file is
# really unreadable for this process.
noperm="$tmp/noperm.txt"
: >"$noperm"
chmod 000 "$noperm"
if test ! -r "$noperm"; then
  run "$tmp/perm" "$noperm"
  chmod 644 "$noperm"
  loghas 'Could not include URL list "[^"]+": .+'
  lognot 'not a regular file'
fi

exit 0
|
||||
@@ -71,3 +71,27 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
|
||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||
nomatch '*[\[\]]' '[]x'
|
||||
|
||||
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
|
||||
# means the size is still unknown (scan time). A size exclusion must stay neutral
|
||||
# then, so the file is fetched and only cancelled once its size is known (#143).
|
||||
# fsize <expected-output> <size> <string> <filter...>: run the filtersize
# selftest with the remaining arguments and require its stdout to equal
# <expected-output> exactly; on a mismatch, say what was expected and what was
# printed, then abort the test with status 1.
fsize() {
  local want="$1" got

  shift
  # httrack's exit status is ignored, only its output is compared
  got="$(httrack -O /dev/null -#test=filtersize "$@")" || true
  if test "$got" != "$want"; then
    echo "filtersize $*: expected '$want', got '$got'" >&2
    exit 1
  fi
}
|
||||
# The leading -* filter is quoted: unquoted it is a glob pattern and would be
# replaced by any file in the current directory whose name starts with '-'.
fsize 'verdict=allowed size_flag=0' -1 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # scan time: keep
fsize 'verdict=forbidden size_flag=1' 5 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # <10KB: cancel
fsize 'verdict=allowed size_flag=1' 20 foo.jpg '-*' '+*.jpg' '-*.jpg*[<10]' # >=10KB: keep
fsize 'verdict=forbidden size_flag=0' -1 foo.txt '-*' '+*.jpg' '-*.jpg*[<10]' # not a jpg
# the '>' operator is just as neutral at scan time, and fires once size is known
fsize 'verdict=allowed size_flag=0' -1 foo.jpg '-*' '+*.jpg' '-*.jpg*[>10]' # scan time: keep
fsize 'verdict=forbidden size_flag=1' 20 foo.jpg '-*' '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
|
||||
|
||||
# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
|
||||
# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
|
||||
nomatch '*[path]/end' 'a?b/end'
|
||||
nomatch '*[file]end' 'foo?xend'
|
||||
nomatch '*[name]X' 'abc?X'
|
||||
match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
|
||||
match '*.aspx' 'page.aspx?y=2'
|
||||
|
||||
@@ -34,6 +34,7 @@ TESTS = \
|
||||
01_engine-dns.test \
|
||||
01_engine-doitlog.test \
|
||||
01_engine-entities.test \
|
||||
01_engine-filelist.test \
|
||||
01_engine-filter.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-idna.test \
|
||||
|
||||
Reference in New Issue
Block a user