mirror of
https://github.com/xroche/httrack.git
synced 2026-07-04 08:04:13 +03:00
Compare commits
2 Commits
master
...
phase1-res
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
04e5ced5f5 | ||
|
|
ccfa286b3c |
@@ -1093,113 +1093,33 @@ static int st_resolve(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Extra args are key=value: adr= cdispo= statuscode= status= strip= urlhack=
|
||||
no-www= no-slash= no-query= n83= type=, plus repeatable prior=adr|fil|sav
|
||||
registering an already-crawled link (dedup/collision paths). */
|
||||
/* Parse raw response-header lines and print the naming-relevant fields. */
|
||||
static int st_header(httrackp *opt, int argc, char **argv) {
|
||||
htsblk r;
|
||||
int i;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "header: needs at least one raw header line\n");
|
||||
return 1;
|
||||
}
|
||||
memset(&r, 0, sizeof(r));
|
||||
for (i = 0; i < argc; i++) {
|
||||
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strcpybuff(line, argv[i]);
|
||||
treathead(NULL, "www.example.com", "/", &r, line);
|
||||
}
|
||||
printf("contenttype=%s cdispo=%s\n", r.contenttype, r.cdispo);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
lien_adrfilsave afs;
|
||||
cache_back cache;
|
||||
struct_back *sback;
|
||||
hash_struct hash;
|
||||
lien_back headers;
|
||||
const char *adr = "www.example.com";
|
||||
const char *cdispo = NULL;
|
||||
int statuscode = HTTP_OK, status = 0;
|
||||
int i;
|
||||
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "savename: needs a fil and a content-type\n");
|
||||
return 1;
|
||||
}
|
||||
/* knobs first: hash_init and the prior links depend on them */
|
||||
for (i = 2; i < argc; i++) {
|
||||
const char *const a = argv[i];
|
||||
|
||||
if (strncmp(a, "adr=", 4) == 0)
|
||||
adr = a + 4;
|
||||
else if (strncmp(a, "cdispo=", 7) == 0)
|
||||
cdispo = a + 7;
|
||||
else if (strncmp(a, "statuscode=", 11) == 0)
|
||||
statuscode = atoi(a + 11);
|
||||
else if (strncmp(a, "status=", 7) == 0)
|
||||
status = atoi(a + 7);
|
||||
else if (strncmp(a, "strip=", 6) == 0)
|
||||
StringCopy(opt->strip_query, a + 6);
|
||||
else if (strncmp(a, "urlhack=", 8) == 0)
|
||||
opt->urlhack = atoi(a + 8) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-www=", 7) == 0)
|
||||
opt->no_www_dedup = atoi(a + 7) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-slash=", 9) == 0)
|
||||
opt->no_slash_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-query=", 9) == 0)
|
||||
opt->no_query_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "n83=", 4) == 0)
|
||||
opt->savename_83 = atoi(a + 4);
|
||||
else if (strncmp(a, "type=", 5) == 0)
|
||||
opt->savename_type = atoi(a + 5);
|
||||
else if (strncmp(a, "prior=", 6) != 0) {
|
||||
fprintf(stderr, "savename: unknown arg '%s'\n", a);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
memset(&afs, 0, sizeof(afs));
|
||||
strcpybuff(afs.af.adr, adr);
|
||||
strcpybuff(afs.af.adr, "www.example.com");
|
||||
strcpybuff(afs.af.fil, argv[0]);
|
||||
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.hashtable = (void *) coucal_new(0);
|
||||
|
||||
sback = back_new(opt, opt->maxsoc * 32 + 1024);
|
||||
/* same wiring as hts_mirror (htscore.c) */
|
||||
hash_init(opt, &hash, opt->urlhack);
|
||||
hash.liens = (const lien_url *const *const *) &opt->liens;
|
||||
opt->hash = &hash;
|
||||
hts_record_init(opt);
|
||||
|
||||
for (i = 2; i < argc; i++) {
|
||||
if (strncmp(argv[i], "prior=", 6) == 0) {
|
||||
char *dup = strdupt(argv[i] + 6);
|
||||
char *const p1 = strchr(dup, '|');
|
||||
char *const p2 = p1 != NULL ? strchr(p1 + 1, '|') : NULL;
|
||||
|
||||
if (p2 == NULL) {
|
||||
fprintf(stderr, "savename: prior needs adr|fil|sav\n");
|
||||
return 1;
|
||||
}
|
||||
*p1 = *p2 = '\0';
|
||||
if (!hts_record_link(opt, dup, p1 + 1, p2 + 1, "", "", NULL))
|
||||
return 1;
|
||||
freet(dup);
|
||||
}
|
||||
}
|
||||
|
||||
memset(&headers, 0, sizeof(headers));
|
||||
headers.status = status;
|
||||
headers.r.statuscode = statuscode;
|
||||
headers.status = 0;
|
||||
headers.r.statuscode = HTTP_OK;
|
||||
strcpybuff(headers.r.contenttype, argv[1]);
|
||||
if (cdispo != NULL)
|
||||
strcpybuff(headers.r.cdispo, cdispo);
|
||||
if (argc >= 3)
|
||||
strcpybuff(headers.r.cdispo, argv[2]);
|
||||
strcpybuff(headers.url_fil, argv[0]);
|
||||
|
||||
url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, &hash, 0, 0,
|
||||
@@ -2006,10 +1926,8 @@ static const struct selftest_entry {
|
||||
st_relative},
|
||||
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",
|
||||
st_resolve},
|
||||
{"header", "<raw-header-line> ...", "response header-line parsing",
|
||||
st_header},
|
||||
{"savename", "<fil> <content-type> [key=value ...]",
|
||||
"local save-name for a URL", st_savename},
|
||||
{"savename", "<fil> <content-type> [cdispo]", "local save-name for a URL",
|
||||
st_savename},
|
||||
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
|
||||
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
|
||||
st_cache_golden},
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Response header-line parsing (treathead via -#test=header <raw-line> ...).
|
||||
# Isolates the wire layer from url_savename, which strips traversal on its own.
|
||||
|
||||
hdr() {
|
||||
local want="$1"
|
||||
shift
|
||||
out="$(httrack -O /dev/null -#test=header "$@" | grep '^contenttype=')"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
hdr 'contenttype=application/pdf cdispo=' 'Content-Type: application/pdf'
|
||||
|
||||
# filename= is honored quoted or bare.
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename="report.pdf"'
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename=report.pdf'
|
||||
|
||||
# Path components in the filename are dropped on the wire (RFC 2616).
|
||||
hdr 'contenttype= cdispo=evil.pdf' \
|
||||
'Content-Disposition: attachment; filename="../../evil.pdf"'
|
||||
@@ -3,30 +3,13 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Local save-name resolution (url_savename via -#test=savename <fil> <content-type> [key=value ...]).
|
||||
# name() asserts on the basename, full() on the whole path; prior= registers an
|
||||
# already-crawled link whose sav is rooted under the -O path (/dev/null here).
|
||||
|
||||
run() {
|
||||
httrack -O /dev/null -#test=savename "$@" | sed -n 's/^savename: //p'
|
||||
}
|
||||
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type> [cdispo]).
|
||||
# Asserts on the basename of "savename: <path>".
|
||||
|
||||
name() {
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "${out##*/}" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
full() {
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
out="$(httrack -O /dev/null -#test=savename "$1" "$2" ${4:+"$4"} | sed -n 's/^savename: //p')"
|
||||
test "${out##*/}" == "$3" || {
|
||||
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
@@ -58,84 +41,8 @@ name '/types/data.json' 'application/json' 'data.json'
|
||||
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||
|
||||
# A Content-Disposition filename replaces the URL name outright.
|
||||
name '/x.php' 'application/pdf' 'report.pdf' cdispo=report.pdf
|
||||
name '/download' 'text/html' 'setup.exe' cdispo=setup.exe
|
||||
name '/x.php' 'application/pdf' 'report.pdf' 'report.pdf'
|
||||
name '/download' 'text/html' 'setup.exe' 'setup.exe'
|
||||
|
||||
# Reserved characters in a hostile Content-Disposition name are sanitized.
|
||||
name '/x.php' 'application/pdf' 'set_up.exe' 'cdispo=set:up.exe'
|
||||
|
||||
# The md5-of-query suffix lands inside a Content-Disposition name too.
|
||||
name '/x.php?id=1' 'application/pdf' 'report681a.pdf' cdispo=report.pdf
|
||||
|
||||
# Still-downloading path (status=-1): mime drives the ext, cdispo is ignored
|
||||
# there (the deliberately unfolded 4th resolve_extension variant).
|
||||
name '/x.pdf' 'text/html' 'x.html' status=-1
|
||||
name '/x.html' 'text/html' 'x.html' status=-1
|
||||
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
|
||||
|
||||
# A redirect answer resolves nothing: delayed placeholder name.
|
||||
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
|
||||
|
||||
# Root and query-only URLs get index + the md5-of-query suffix.
|
||||
name '/' 'text/html' 'index.html'
|
||||
name '/?a=1' 'text/html' 'index3872.html'
|
||||
|
||||
# Same URL crawled before: reuse its sav verbatim (case preserved).
|
||||
full '/X.PHP' 'text/html' 'www.example.com/CASE.HTML' \
|
||||
'prior=www.example.com|/X.PHP|www.example.com/CASE.HTML'
|
||||
|
||||
# Another URL owns the name: collision suffix -2, then -3, case-insensitively.
|
||||
name '/x.php' 'text/html' 'x-2.html' \
|
||||
'prior=www.example.com|/other.html|/dev/null/www.example.com/x.html'
|
||||
name '/x.php' 'text/html' 'x-3.html' \
|
||||
'prior=www.example.com|/o1.html|/dev/null/www.example.com/x.html' \
|
||||
'prior=www.example.com|/o2.html|/dev/null/www.example.com/x-2.html'
|
||||
name '/INDEX.HTML' 'text/html' 'INDEX-2.HTML' \
|
||||
'prior=www.example.com|/index.html|/dev/null/www.example.com/index.html'
|
||||
|
||||
# Same basename in another directory is NOT a collision.
|
||||
name '/x.php' 'text/html' 'x.html' \
|
||||
'prior=www.example.com|/sub/x.html|/dev/null/www.example.com/sub/x.html'
|
||||
|
||||
# 8-3 modes: DOS truncates every component to 8+3, ISO9660 level 2 to 31.
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE/DIRECTOR/VERYLONG.HTM' n83=1
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE_C/DIRECTORY_LONG/VERYLONGFILENAME.HTM' n83=2
|
||||
name '/verylongfilename.php' 'text/html' 'VERYLO-2.HTM' n83=1 \
|
||||
'prior=www.example.com|/other.html|/dev/null/EXAMPLE/VERYLONG.HTM'
|
||||
|
||||
# urlhack dedup (#271): // collapse and www-strip map to the prior link's sav;
|
||||
# the per-feature negatives opt out and take a fresh name.
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/PRIOR.html' \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' no-slash=1 \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/www.example.com/W-PRIOR.html' adr=example.com \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/example.com/w.html' adr=example.com no-www=1 \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
|
||||
# Distinct URLs must stay distinct under urlhack (no over-normalization).
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' \
|
||||
'prior=www.example.com|/a/c.php|/dev/null/www.example.com/a/C-PRIOR.html'
|
||||
|
||||
# --strip-query (#112): stripped key dedups onto the prior sav; without the
|
||||
# option the same URLs stay distinct.
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/PAGE-PRIOR.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# A kept key that differs must still block the dedup (no over-stripping).
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=4|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# Hostile fils stay rooted under the mirror: ../ (raw or %2e-encoded) drops out,
|
||||
# control characters become spaces, oversized names cap at 210 chars (the cap
|
||||
# can chop the extension off entirely).
|
||||
full '/../../etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/%2e%2e/%2e%2e/etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/x.php' 'application/pdf' '/dev/null/www.example.com///evil.exe' 'cdispo=../../evil.exe'
|
||||
name $'/evil\rname\t.php' 'text/html' 'evil name .html'
|
||||
name "/$(printf 'a%.0s' {1..300}).php" 'text/html' "$(printf 'a%.0s' {1..210})"
|
||||
name '/x.php' 'application/pdf' 'set_up.exe' 'set:up.exe'
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Disposition names the saved file: the attachment filename replaces
|
||||
# the URL-derived name, and a traversal filename is reduced to its last
|
||||
# component, inside the mirror.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'cdispo/report.pdf' \
|
||||
--file-matches 'cdispo/report.pdf' '%PDF' \
|
||||
--not-found 'cdispo/fetch.pdf' \
|
||||
--found 'cdispo/evil.pdf' \
|
||||
--not-found 'evil.pdf' \
|
||||
httrack 'BASEURL/cdispo/index.html'
|
||||
@@ -38,7 +38,6 @@ TESTS = \
|
||||
01_engine-ftp-line.test \
|
||||
01_engine-ftp-userpass.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-header.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-escape-room.test \
|
||||
01_engine-inplace-escape.test \
|
||||
@@ -92,7 +91,6 @@ TESTS = \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test
|
||||
31_local-javaclass.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -134,14 +134,12 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
|
||||
# --- type/extension matrix (issue #267 family) -------------------------
|
||||
|
||||
def send_raw(self, body, content_type, extra_headers=()):
|
||||
def send_raw(self, body, content_type):
|
||||
"""Send a raw body with an explicit Content-Type, or none at all when
|
||||
content_type is None (to observe httrack's typeless-file naming)."""
|
||||
self.send_response(200)
|
||||
if content_type is not None:
|
||||
self.send_header("Content-Type", content_type)
|
||||
for name, value in extra_headers:
|
||||
self.send_header(name, value)
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
@@ -356,27 +354,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# Content-Disposition naming: the attachment filename replaces the
|
||||
# URL-derived name; path components in it are stripped (RFC 2616).
|
||||
CDISPO_NAMES = {
|
||||
"/cdispo/fetch.php": "report.pdf",
|
||||
"/cdispo/evil.php": "../../evil.pdf",
|
||||
}
|
||||
|
||||
def route_cdispo_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="fetch.php">report</a>\n' '\t<a href="evil.php">evil</a>\n'
|
||||
)
|
||||
|
||||
def route_cdispo(self):
|
||||
filename = self.CDISPO_NAMES[urlsplit(self.path).path]
|
||||
cdispo = 'attachment; filename="%s"' % filename
|
||||
self.send_raw(
|
||||
self.FAKE_PDF,
|
||||
"application/pdf",
|
||||
extra_headers=[("Content-Disposition", cdispo)],
|
||||
)
|
||||
|
||||
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
|
||||
# that must be dropped before the target is fetched. A leaked '#' reaches the
|
||||
# strict-server guard below and 400s.
|
||||
@@ -429,9 +406,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/mimex/index.html": route_mimex_index,
|
||||
"/mimex/blob.pdf": route_mimex_blob,
|
||||
"/mimex/real.html": route_mimex_real,
|
||||
"/cdispo/index.html": route_cdispo_index,
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
|
||||
Reference in New Issue
Block a user