mirror of
https://github.com/xroche/httrack.git
synced 2026-06-21 09:38:24 +03:00
Compare commits
5 Commits
fix/proxy-
...
fix/css-im
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 | ||
|
|
0145ec37a3 | ||
|
|
a80fab38ba |
@@ -1341,6 +1341,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
int can_avoid_quotes = 0;
|
int can_avoid_quotes = 0;
|
||||||
char quotes_replacement = '\0';
|
char quotes_replacement = '\0';
|
||||||
int ensure_not_mime = 0;
|
int ensure_not_mime = 0;
|
||||||
|
// @import: the quoted token is the URL; a trailing
|
||||||
|
// media/supports/layer condition is not part of it
|
||||||
|
int is_import = 0;
|
||||||
|
|
||||||
if (inscript_tag)
|
if (inscript_tag)
|
||||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||||
@@ -1390,6 +1393,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((nc = strfield(html, "import"))) { // import "url"
|
if ((nc = strfield(html, "import"))) { // import "url"
|
||||||
if (is_space(*(html + nc))) {
|
if (is_space(*(html + nc))) {
|
||||||
expected = 0; // no char expected
|
expected = 0; // no char expected
|
||||||
|
is_import = 1;
|
||||||
} else
|
} else
|
||||||
nc = 0;
|
nc = 0;
|
||||||
}
|
}
|
||||||
@@ -1407,6 +1411,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||||
const char *b, *c;
|
const char *b, *c;
|
||||||
int ndelim = 1;
|
int ndelim = 1;
|
||||||
|
int valid_url = 0;
|
||||||
|
|
||||||
if ((*a == 34) || (*a == '\''))
|
if ((*a == 34) || (*a == '\''))
|
||||||
a++;
|
a++;
|
||||||
@@ -1421,12 +1426,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
c = b--;
|
c = b--;
|
||||||
|
// no closing delimiter here (truncated input):
|
||||||
|
// Don't scan past the buffer NUL or capture it.
|
||||||
|
if (*c != '\0') {
|
||||||
c += ndelim;
|
c += ndelim;
|
||||||
while(*c == ' ')
|
while (*c == ' ')
|
||||||
c++;
|
c++;
|
||||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
valid_url =
|
||||||
|| (*c == '\r')) {
|
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||||
c -= (ndelim + 1);
|
(*c == '\r') ||
|
||||||
|
(is_import && *(b + 1 + ndelim) == ' ');
|
||||||
|
}
|
||||||
|
if (valid_url) {
|
||||||
|
// URL end = last char (b), not the delimiter
|
||||||
|
c = b;
|
||||||
if ((int) (c - a + 1)) {
|
if ((int) (c - a + 1)) {
|
||||||
if (ensure_not_mime) {
|
if (ensure_not_mime) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@@ -1485,7 +1498,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1692,6 +1704,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
hts_nodetect[i -
|
hts_nodetect[i -
|
||||||
1]);
|
1]);
|
||||||
}
|
}
|
||||||
|
// xmlns / xmlns:prefix declare
|
||||||
|
// XML namespaces, not resources
|
||||||
|
// (#191)
|
||||||
|
else {
|
||||||
|
const int xl = strfield(
|
||||||
|
intag_startattr, "xmlns");
|
||||||
|
const char xc =
|
||||||
|
intag_startattr[xl];
|
||||||
|
if (xl &&
|
||||||
|
(xc == ':' || xc == '=' ||
|
||||||
|
is_space(xc))) {
|
||||||
|
url_ok = 0;
|
||||||
|
hts_log_print(
|
||||||
|
opt, LOG_DEBUG,
|
||||||
|
"dirty parsing: xmlns "
|
||||||
|
"namespace avoided");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -154,4 +154,70 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
|
|||||||
grep -q 'title="file://' "$saved2" ||
|
grep -q 'title="file://' "$saved2" ||
|
||||||
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
||||||
|
|
||||||
|
# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
|
||||||
|
# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
|
||||||
|
site3="$tmp/xmlns"
|
||||||
|
mkdir -p "$site3"
|
||||||
|
for f in ns og rdfs real; do gif "$site3/$f.gif"; done
|
||||||
|
cat >"$site3/index.html" <<EOF
|
||||||
|
<html xmlns="file://$site3/ns.gif"><body>
|
||||||
|
<svg xmlns:og="file://$site3/og.gif"></svg>
|
||||||
|
<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
|
||||||
|
<a href="file://$site3/real.gif">real link</a>
|
||||||
|
</body></html>
|
||||||
|
EOF
|
||||||
|
out3="$tmp/xmlns-out"
|
||||||
|
crawl "$site3/index.html" "$out3"
|
||||||
|
|
||||||
|
# the real link is still captured
|
||||||
|
found "real.gif" "$out3"
|
||||||
|
# namespace-declaration targets must not be fetched (default + prefixed forms)
|
||||||
|
notfound "ns.gif" "$out3"
|
||||||
|
notfound "og.gif" "$out3"
|
||||||
|
notfound "rdfs.gif" "$out3"
|
||||||
|
|
||||||
|
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||||
|
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||||
|
# a space before ';'); they are the negative controls: without the parser fix the
|
||||||
|
# URL is dropped, so a regression fails these found() checks.
|
||||||
|
site4="$tmp/cssimport"
|
||||||
|
mkdir -p "$site4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||||
|
cat >"$site4/main.css" <<'EOF'
|
||||||
|
@import url(nq.css);
|
||||||
|
@import url("dqu.css");
|
||||||
|
@import url('squ.css');
|
||||||
|
@import "dqs.css";
|
||||||
|
@import 'sqs.css';
|
||||||
|
@import url(med.css) screen and (min-width: 400px);
|
||||||
|
@import "cond.css" screen;
|
||||||
|
@import "sup.css" supports(display: flex);
|
||||||
|
@import url(lay.css) layer(base);
|
||||||
|
@import "spc.css" ;
|
||||||
|
EOF
|
||||||
|
out4="$tmp/cssimport-out"
|
||||||
|
crawl "$site4/main.css" "$out4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||||
|
|
||||||
|
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||||
|
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||||
|
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||||
|
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||||
|
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||||
|
grep -Fq "$cond" "$m4" ||
|
||||||
|
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||||
|
done
|
||||||
|
|
||||||
|
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||||
|
# capture a bogus link; a valid sibling import is still captured. Guards a heap
|
||||||
|
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||||
|
site5="$tmp/cssimport-trunc"
|
||||||
|
mkdir -p "$site5"
|
||||||
|
printf 'body{}\n' >"$site5/good.css"
|
||||||
|
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||||
|
out5="$tmp/cssimport-trunc-out"
|
||||||
|
crawl "$site5/main.css" "$out5"
|
||||||
|
found "good.css" "$out5"
|
||||||
|
notfound "trunc" "$out5"
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
Reference in New Issue
Block a user