mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 00:58:47 +03:00
Compare commits
5 Commits
fix/css-im
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cae11499f1 | ||
|
|
02c7f4ebf6 | ||
|
|
9070b44a70 | ||
|
|
799c045061 | ||
|
|
fb1ee3bf2e |
@@ -296,6 +296,27 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
||||
return dst;
|
||||
}
|
||||
|
||||
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||
would underflow; space reads as the word boundary the guards want there. */
|
||||
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Tell whether the len bytes starting at s spell exactly one HTTP method
   name (case-insensitive). The first argument of XHR.open(method, url) is a
   method, not a URL, and must not be taken for a link (#218).
   Returns 1 when the token is a known method, 0 otherwise. */
static int is_http_method(const char *s, size_t len) {
  static const char *const known[] = {"GET",    "POST",  "PUT",
                                      "DELETE", "HEAD",  "OPTIONS",
                                      "PATCH",  "TRACE", NULL};
  const char *const *m;

  for (m = known; *m != NULL; m++) {
    /* The lengths must agree first, so a mere prefix of the token (or of the
       method name) is never accepted. */
    if (strlen(*m) != len)
      continue;
    /* Accepted only when strfield reports all len characters as matching. */
    if (strfield(s, *m) == (int) len)
      return 1;
  }
  return 0;
}
|
||||
|
||||
/* Main parser */
|
||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -556,7 +577,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
p = strfield(html, "title");
|
||||
if (p) {
|
||||
if (*(html - 1) == '/')
|
||||
if (html_prevc(html, r->adr) == '/')
|
||||
p = 0; // /title
|
||||
} else {
|
||||
if (strfield(html, "/html"))
|
||||
@@ -1341,6 +1362,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int can_avoid_quotes = 0;
|
||||
char quotes_replacement = '\0';
|
||||
int ensure_not_mime = 0;
|
||||
// .open(method,url): reject an HTTP-method first arg (#218)
|
||||
int ensure_not_method = 0;
|
||||
// @import: the quoted token is the URL; a trailing
|
||||
// media/supports/layer condition is not part of it
|
||||
int is_import = 0;
|
||||
@@ -1360,9 +1383,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (!nc)
|
||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||
if (!nc) { // location="doc"
|
||||
if ((nc = strfield(html, "location"))
|
||||
&& !isspace(*(html - 1))
|
||||
)
|
||||
if ((nc = strfield(html, "location")) &&
|
||||
!isspace(html_prevc(html, r->adr)))
|
||||
nc = 0;
|
||||
}
|
||||
if (!nc)
|
||||
@@ -1372,6 +1394,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = "),"; // fin: virgule ou parenthèse
|
||||
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
||||
ensure_not_method = 1; // xhr.open: don't grab method
|
||||
}
|
||||
if (!nc)
|
||||
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
||||
@@ -1383,7 +1406,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
}
|
||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
||||
if (!nc && (nc = strfield(html, "url")) &&
|
||||
(!isalnum(html_prevc(html, r->adr))) &&
|
||||
html_prevc(html, r->adr) != '_') { // url(url)
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
can_avoid_quotes = 1;
|
||||
@@ -1455,6 +1480,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
// XHR.open's "GET" etc. is a method, not a URL
|
||||
if (a != NULL && ensure_not_method &&
|
||||
is_http_method(a, (size_t) (c - a + 1))) {
|
||||
a = NULL;
|
||||
}
|
||||
// Check for bogus links (Vasiliy)
|
||||
if (a != NULL) {
|
||||
const size_t size = c - a + 1;
|
||||
|
||||
@@ -220,4 +220,48 @@ crawl "$site5/main.css" "$out5"
|
||||
found "good.css" "$out5"
|
||||
notfound "trunc" "$out5"
|
||||
|
||||
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
|
||||
# url() target is still captured; here it just must not underflow.
|
||||
site6="$tmp/parse-off0"
|
||||
mkdir -p "$site6"
|
||||
printf 'body{}\n' >"$site6/off0.css"
|
||||
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||
out6="$tmp/parse-off0-out"
|
||||
crawl "$site6/main.css" "$out6"
|
||||
found "off0.css" "$out6"
|
||||
|
||||
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
|
||||
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
|
||||
# fixture saves a bare file named GET; a live server mangles it to GET.html).
|
||||
# window.open(url) detection must be unaffected.
|
||||
site7="$tmp/xhropen"
|
||||
mkdir -p "$site7"
|
||||
gif "$site7/winopen.gif"
|
||||
cat >"$site7/index.html" <<EOF
|
||||
<html><body><script>
|
||||
var x = new XMLHttpRequest();
|
||||
x.open("GET", "ajax_info.txt");
|
||||
var y = new XMLHttpRequest();
|
||||
y.open("Post", "submit.cgi");
|
||||
window.open("file://$site7/winopen.gif");
|
||||
</script></body></html>
|
||||
EOF
|
||||
out7="$tmp/xhropen-out"
|
||||
crawl "$site7/index.html" "$out7"
|
||||
# negative control: without the fix a file named exactly GET is downloaded
|
||||
notfound "GET" "$out7"
|
||||
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
|
||||
# method is rejected too, so a file named Post must not appear either
|
||||
notfound "Post" "$out7"
|
||||
# regression guard: window.open(url) is still detected, so its absolute URL is
|
||||
# rewritten to a local link. The rewrite only happens if the parser saw it, so
|
||||
# these two assertions fail if .open detection broke (not a trivial --near save).
|
||||
saved7=$(savedhtml "$out7")
|
||||
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
|
||||
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
|
||||
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
|
||||
! grep -Fq 'window.open("file://' "$saved7" ||
|
||||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
|
||||
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user