mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 00:58:47 +03:00
Compare commits
4 Commits
fix/html-u
...
fix/css-ur
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
42acbe6c97 | ||
|
|
cae11499f1 | ||
|
|
02c7f4ebf6 | ||
|
|
9070b44a70 |
@@ -302,6 +302,42 @@ static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Tell whether the len bytes starting at s spell exactly one HTTP method
   name, compared case-insensitively. The first argument of XHR.open is such
   a method, not a URL (#218), so the parser uses this to discard it.

   s   : start of the candidate token (need not be NUL-terminated at len)
   len : number of bytes in the candidate token

   Returns 1 when the token is a known method, 0 otherwise. */
static int is_http_method(const char *s, size_t len) {
  static const char *const known[] = {"GET",  "POST",    "PUT",   "DELETE",
                                      "HEAD", "OPTIONS", "PATCH", "TRACE"};
  const size_t count = sizeof(known) / sizeof(known[0]);
  size_t k;

  for (k = 0; k < count; k++) {
    const char *const name = known[k];

    /* Lengths must agree first, so that a longer token which merely starts
       with a method name is not mistaken for the method itself. */
    if (strlen(name) != len)
      continue;
    /* NOTE(review): strfield is presumed to return the number of characters
       of name matched at s, ignoring case -- confirm against its definition. */
    if (strfield(s, name) == (int) len)
      return 1;
  }
  return 0;
}
|
||||
|
||||
/* Percent-encode '(' and ')' in place in a link emitted into an unquoted CSS
   url(...): a literal ')' closes the token early and the UA mis-parses the
   value (#163). The UA decodes %28/%29 back to the saved-on-disk name.

   s    : NUL-terminated string, rewritten in place (NULL is ignored)
   size : capacity of s in bytes, terminating NUL included (0 is ignored)

   Output that does not fit in size is dropped at a character boundary: an
   escape sequence is never written partially. A string that holds no
   parenthesis and already fits in size is left untouched. */
static void escape_url_parens(char *const s, const size_t size) {
  size_t in_len = 0;  /* source characters that are kept */
  size_t out_len = 0; /* length of the escaped result, always < size */
  size_t r, w;

  if (s == NULL || size == 0)
    return;

  /* Pass 1: measure how much of the source still fits once escaped. Room is
     reserved per character (3 bytes for a parenthesis, 1 otherwise), so plain
     characters are never dropped for want of space they do not need. */
  while (s[in_len] != '\0') {
    const size_t need = (s[in_len] == '(' || s[in_len] == ')') ? 3 : 1;

    if (need >= size - out_len) /* no room for this token plus the NUL */
      break;
    out_len += need;
    in_len++;
  }

  /* Pass 2: expand from the end towards the start. The write index w stays
     ahead of the read index r by two bytes per parenthesis still to be
     processed, so no unread source byte is ever overwritten and no scratch
     buffer is needed. Once w meets r the remaining prefix has no parenthesis
     and is already in place. */
  s[out_len] = '\0';
  r = in_len;
  w = out_len;
  while (w > r) {
    const char c = s[--r];

    if (c == '(' || c == ')') {
      s[--w] = (c == '(') ? '8' : '9';
      s[--w] = '2';
      s[--w] = '%';
    } else {
      s[--w] = c;
    }
  }
}
|
||||
|
||||
/* Main parser */
|
||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -1347,6 +1383,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int can_avoid_quotes = 0;
|
||||
char quotes_replacement = '\0';
|
||||
int ensure_not_mime = 0;
|
||||
// .open(method,url): reject an HTTP-method first arg (#218)
|
||||
int ensure_not_method = 0;
|
||||
// @import: the quoted token is the URL; a trailing
|
||||
// media/supports/layer condition is not part of it
|
||||
int is_import = 0;
|
||||
@@ -1377,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = "),"; // fin: virgule ou parenthèse
|
||||
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
||||
ensure_not_method = 1; // xhr.open: don't grab method
|
||||
}
|
||||
if (!nc)
|
||||
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
||||
@@ -1462,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
// XHR.open's "GET" etc. is a method, not a URL
|
||||
if (a != NULL && ensure_not_method &&
|
||||
is_http_method(a, (size_t) (c - a + 1))) {
|
||||
a = NULL;
|
||||
}
|
||||
// Check for bogus links (Vasiliy)
|
||||
if (a != NULL) {
|
||||
const size_t size = c - a + 1;
|
||||
@@ -3004,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
/* Never escape high-chars (we don't know the encoding!!) */
|
||||
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
||||
|
||||
// unquoted CSS url(...): keep parens escaped (#163)
|
||||
if (ending_p == ')')
|
||||
escape_url_parens(tempo, sizeof(tempo));
|
||||
|
||||
//if (!no_esc_utf)
|
||||
// escape_uri(tempo); // escape with %xx
|
||||
//else {
|
||||
|
||||
@@ -231,4 +231,55 @@ out6="$tmp/parse-off0-out"
|
||||
crawl "$site6/main.css" "$out6"
|
||||
found "off0.css" "$out6"
|
||||
|
||||
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
|
||||
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
|
||||
# fixture saves a bare file named GET; a live server mangles it to GET.html).
|
||||
# window.open(url) detection must be unaffected.
|
||||
site7="$tmp/xhropen"
|
||||
mkdir -p "$site7"
|
||||
gif "$site7/winopen.gif"
|
||||
cat >"$site7/index.html" <<EOF
|
||||
<html><body><script>
|
||||
var x = new XMLHttpRequest();
|
||||
x.open("GET", "ajax_info.txt");
|
||||
var y = new XMLHttpRequest();
|
||||
y.open("Post", "submit.cgi");
|
||||
window.open("file://$site7/winopen.gif");
|
||||
</script></body></html>
|
||||
EOF
|
||||
out7="$tmp/xhropen-out"
|
||||
crawl "$site7/index.html" "$out7"
|
||||
# negative control: without the fix a file named exactly GET is downloaded
|
||||
notfound "GET" "$out7"
|
||||
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
|
||||
# method is rejected too, so a file named Post must not appear either
|
||||
notfound "Post" "$out7"
|
||||
# regression guard: window.open(url) is still detected, so its absolute URL is
|
||||
# rewritten to a local link. The rewrite only happens if the parser saw it, so
|
||||
# these two assertions fail if .open detection broke (not a trivial --near save).
|
||||
saved7=$(savedhtml "$out7")
|
||||
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
|
||||
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
|
||||
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
|
||||
! grep -Fq 'window.open("file://' "$saved7" ||
|
||||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
|
||||
|
||||
# Parens inside an unquoted CSS url(...) (#163): the saved-on-disk name has
|
||||
# literal '(' ')' (the source %28/%29 decode when fetching), but a literal ')'
|
||||
# in the rewritten url() would close the token early and break the value, so
|
||||
# they must stay percent-encoded. Negative control: without the fix the output
|
||||
# is url(img%20(1).gif) and the grep for %281%29 fails (parens are RFC2396
|
||||
# "mark" chars, which the URI escaper leaves alone).
|
||||
site8="$tmp/cssparens"
|
||||
mkdir -p "$site8"
|
||||
gif "$site8/img (1).gif"
|
||||
printf 'body { background-image: url(img%%20%%281%%29.gif); }\n' >"$site8/style.css"
|
||||
out8="$tmp/cssparens-out"
|
||||
crawl "$site8/style.css" "$out8"
|
||||
found "img (1).gif" "$out8"
|
||||
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
|
||||
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
|
||||
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
|
||||
! echo "FAIL #163: parens in unquoted CSS url() not percent-encoded on rewrite" || exit 1
|
||||
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user