mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 00:58:47 +03:00
Compare commits
5 Commits
fix/xmlns-
...
fix/html-u
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
799c045061 | ||
|
|
fb1ee3bf2e | ||
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 |
@@ -296,6 +296,12 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
||||
return dst;
|
||||
}
|
||||
|
||||
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||
would underflow; space reads as the word boundary the guards want there. */
|
||||
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Main parser */
|
||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -556,7 +562,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
p = strfield(html, "title");
|
||||
if (p) {
|
||||
if (*(html - 1) == '/')
|
||||
if (html_prevc(html, r->adr) == '/')
|
||||
p = 0; // /title
|
||||
} else {
|
||||
if (strfield(html, "/html"))
|
||||
@@ -1341,6 +1347,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int can_avoid_quotes = 0;
|
||||
char quotes_replacement = '\0';
|
||||
int ensure_not_mime = 0;
|
||||
// @import: the quoted token is the URL; a trailing
|
||||
// media/supports/layer condition is not part of it
|
||||
int is_import = 0;
|
||||
|
||||
if (inscript_tag)
|
||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||
@@ -1357,9 +1366,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (!nc)
|
||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||
if (!nc) { // location="doc"
|
||||
if ((nc = strfield(html, "location"))
|
||||
&& !isspace(*(html - 1))
|
||||
)
|
||||
if ((nc = strfield(html, "location")) &&
|
||||
!isspace(html_prevc(html, r->adr)))
|
||||
nc = 0;
|
||||
}
|
||||
if (!nc)
|
||||
@@ -1380,7 +1388,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
}
|
||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
||||
if (!nc && (nc = strfield(html, "url")) &&
|
||||
(!isalnum(html_prevc(html, r->adr))) &&
|
||||
html_prevc(html, r->adr) != '_') { // url(url)
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
can_avoid_quotes = 1;
|
||||
@@ -1390,6 +1400,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((nc = strfield(html, "import"))) { // import "url"
|
||||
if (is_space(*(html + nc))) {
|
||||
expected = 0; // no char expected
|
||||
is_import = 1;
|
||||
} else
|
||||
nc = 0;
|
||||
}
|
||||
@@ -1407,6 +1418,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||
const char *b, *c;
|
||||
int ndelim = 1;
|
||||
int valid_url = 0;
|
||||
|
||||
if ((*a == 34) || (*a == '\''))
|
||||
a++;
|
||||
@@ -1421,12 +1433,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
b++;
|
||||
}
|
||||
c = b--;
|
||||
c += ndelim;
|
||||
while(*c == ' ')
|
||||
c++;
|
||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
||||
|| (*c == '\r')) {
|
||||
c -= (ndelim + 1);
|
||||
// no closing delimiter here (truncated input):
|
||||
// Don't scan past the buffer NUL or capture it.
|
||||
if (*c != '\0') {
|
||||
c += ndelim;
|
||||
while (*c == ' ')
|
||||
c++;
|
||||
valid_url =
|
||||
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||
(*c == '\r') ||
|
||||
(is_import && *(b + 1 + ndelim) == ' ');
|
||||
}
|
||||
if (valid_url) {
|
||||
// URL end = last char (b), not the delimiter
|
||||
c = b;
|
||||
if ((int) (c - a + 1)) {
|
||||
if (ensure_not_mime) {
|
||||
int i = 0;
|
||||
@@ -1485,7 +1505,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,4 +176,59 @@ notfound "ns.gif" "$out3"
|
||||
notfound "og.gif" "$out3"
|
||||
notfound "rdfs.gif" "$out3"
|
||||
|
||||
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||
# a space before ';'); they are the negative controls: without the parser fix the
|
||||
# URL is dropped, so a regression fails these found() checks.
|
||||
site4="$tmp/cssimport"
|
||||
mkdir -p "$site4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||
cat >"$site4/main.css" <<'EOF'
|
||||
@import url(nq.css);
|
||||
@import url("dqu.css");
|
||||
@import url('squ.css');
|
||||
@import "dqs.css";
|
||||
@import 'sqs.css';
|
||||
@import url(med.css) screen and (min-width: 400px);
|
||||
@import "cond.css" screen;
|
||||
@import "sup.css" supports(display: flex);
|
||||
@import url(lay.css) layer(base);
|
||||
@import "spc.css" ;
|
||||
EOF
|
||||
out4="$tmp/cssimport-out"
|
||||
crawl "$site4/main.css" "$out4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||
|
||||
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||
grep -Fq "$cond" "$m4" ||
|
||||
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||
done
|
||||
|
||||
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||
# capture a bogus link; a valid sibling import is still captured. Guards against a heap
|
||||
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||
site5="$tmp/cssimport-trunc"
|
||||
mkdir -p "$site5"
|
||||
printf 'body{}\n' >"$site5/good.css"
|
||||
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||
out5="$tmp/cssimport-trunc-out"
|
||||
crawl "$site5/main.css" "$out5"
|
||||
found "good.css" "$out5"
|
||||
notfound "trunc" "$out5"
|
||||
|
||||
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||
# word-boundary guard read *(html-1), one byte before the buffer (aborts under ASan). The
|
||||
# url() target is still captured; here it just must not underflow.
|
||||
site6="$tmp/parse-off0"
|
||||
mkdir -p "$site6"
|
||||
printf 'body{}\n' >"$site6/off0.css"
|
||||
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||
out6="$tmp/parse-off0-out"
|
||||
crawl "$site6/main.css" "$out6"
|
||||
found "off0.css" "$out6"
|
||||
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user