Compare commits

..

6 Commits

Author SHA1 Message Date
Xavier Roche
2e948b9acd htsparse: percent-encode parens in rewritten CSS url() (#163)
A source url(...) whose target encodes '(' ')' as %28/%29 was rewritten
with literal parens, because they are RFC2396 "mark" characters that the
URI escaper (escape_uri_utf, mode 30) leaves alone. In an unquoted CSS
url(...) the literal ')' closes the token early, so the browser mis-parses
the value and drops the background image.

Re-escape '(' and ')' back to %28/%29 when emitting the link, gated on the
url() context (ending_p == ')'). The UA decodes them to the saved-on-disk
name, so the reference still resolves. Quoted url("...") and ordinary HTML
attributes keep their parens, matching prior behavior.

Test in 01_engine-parse.test crawls a CSS fixture whose url() references a
%20%28...%29 name and asserts the rewrite keeps the parens encoded;
negative control confirmed (literal-paren output fails it).

Closes #163

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 10:01:17 +02:00
Xavier Roche
cae11499f1 Merge pull request #399 from xroche/fix/js-string-falsepos-218
htsparse: don't treat XHR.open's method argument as a URL (#218)
2026-06-19 20:36:26 +02:00
Xavier Roche
02c7f4ebf6 htsparse: don't treat XHR.open's method argument as a URL (#218)
The JavaScript URL detector matched `.open(` for window.open("url",...)
and captured the first argument as a link. XMLHttpRequest.open(method,
url) puts the HTTP method first, so `xhr.open("GET", "ajax_info.txt")`
turned "GET" into a bogus link, rewritten to "GET.html" on a live server.

Reject a first argument that is exactly an HTTP method, mirroring the
existing ensure_not_mime guard. window.open(url) is unaffected; the real
XHR url (the second argument) is still picked up by the dirty parser.

Closes #218

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-19 20:27:04 +02:00
Xavier Roche
9070b44a70 Merge pull request #398 from xroche/fix/html-underflow-396
htsparse: fix buffer underflow reading *(html-1) at offset 0 (#396)
2026-06-19 19:55:40 +02:00
Xavier Roche
799c045061 htsparse: don't read *(html-1) before the parse buffer (#396)
The link detector's word-boundary guards dereference *(html-1) to check
the byte preceding a matched token. When the token sits at the very start
of the parse buffer (html == r->adr), that reads one byte before the
allocation: a heap-buffer-overflow under ASan, silent on a normal build.
A stylesheet beginning with a url() token is enough to hit it.

Route the three reachable guards (url(), location=, the makeindex /title
check) through html_prevc(), which returns a space sentinel at the buffer
start. Space is the right value for these tests: a token at offset 0 is at
a word boundary, so it stays a valid match. The other *(html-1) sites only
run after html has advanced past an opening tag or quote.

Covers it with an offset-0 url() fixture in 01_engine-parse.test; without
the fix it aborts at htsparse.c:1386 under the CI sanitizer job.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-19 19:44:25 +02:00
Xavier Roche
fb1ee3bf2e Merge pull request #397 from xroche/fix/css-import-94
CSS @import: capture URLs that carry a media/supports/layer condition (#94)
2026-06-19 19:30:21 +02:00
2 changed files with 163 additions and 5 deletions

View File

@@ -296,6 +296,48 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
return dst;
}
/* Byte before html, or a space sentinel at the buffer start where html[-1]
would underflow; space reads as the word boundary the guards want there. */
static HTS_INLINE char html_prevc(const char *html, const char *start) {
return html > start ? html[-1] : ' ';
}
/* True if [s, s+len) is exactly an HTTP method token (XHR.open's first
argument is a method, not a URL: #218). Case-insensitive. */
static int is_http_method(const char *s, size_t len) {
static const char *const methods[] = {"GET", "POST", "PUT",
"DELETE", "HEAD", "OPTIONS",
"PATCH", "TRACE", NULL};
int i;
for (i = 0; methods[i] != NULL; i++) {
if (strlen(methods[i]) == len && strfield(s, methods[i]) == (int) len)
return 1;
}
return 0;
}
/* Percent-encode '(' and ')' in a link emitted into an unquoted url(...) (CSS
or JS): a literal ')' closes the token early and the UA mis-parses the value
(#163). The UA decodes %28/%29 back to the saved-on-disk name. */
static void escape_url_parens(char *const s, const size_t size) {
char BIGSTK buff[HTS_URLMAXSIZE * 2];
size_t i, j;
for (i = 0, j = 0; s[i] != '\0' && j + 3 < size && j + 3 < sizeof(buff);
i++) {
if (s[i] == '(' || s[i] == ')') {
buff[j++] = '%';
buff[j++] = '2';
buff[j++] = s[i] == '(' ? '8' : '9';
} else {
buff[j++] = s[i];
}
}
buff[j] = '\0';
strlcpybuff(s, buff, size);
}
/* Main parser */
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
char catbuff[CATBUFF_SIZE];
@@ -556,7 +598,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
if (opt->getmode & HTS_GETMODE_HTML) {
p = strfield(html, "title");
if (p) {
if (*(html - 1) == '/')
if (html_prevc(html, r->adr) == '/')
p = 0; // /title
} else {
if (strfield(html, "/html"))
@@ -1341,6 +1383,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
int can_avoid_quotes = 0;
char quotes_replacement = '\0';
int ensure_not_mime = 0;
// .open(method,url): reject an HTTP-method first arg (#218)
int ensure_not_method = 0;
// @import: the quoted token is the URL; a trailing
// media/supports/layer condition is not part of it
int is_import = 0;
@@ -1360,9 +1404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
if (!nc)
nc = strfield(html, ":location"); // javascript:location="doc"
if (!nc) { // location="doc"
if ((nc = strfield(html, "location"))
&& !isspace(*(html - 1))
)
if ((nc = strfield(html, "location")) &&
!isspace(html_prevc(html, r->adr)))
nc = 0;
}
if (!nc)
@@ -1372,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
expected = '('; // parenthèse
expected_end = "),"; // fin: virgule ou parenthèse
ensure_not_mime = 1; //* ensure the url is not a mime type */
ensure_not_method = 1; // xhr.open: don't grab method
}
if (!nc)
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
@@ -1383,7 +1427,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
expected = '('; // parenthèse
expected_end = ")"; // fin: parenthèse
}
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
if (!nc && (nc = strfield(html, "url")) &&
(!isalnum(html_prevc(html, r->adr))) &&
html_prevc(html, r->adr) != '_') { // url(url)
expected = '('; // parenthèse
expected_end = ")"; // fin: parenthèse
can_avoid_quotes = 1;
@@ -1455,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
i++;
}
}
// XHR.open's "GET" etc. is a method, not a URL
if (a != NULL && ensure_not_method &&
is_http_method(a, (size_t) (c - a + 1))) {
a = NULL;
}
// Check for bogus links (Vasiliy)
if (a != NULL) {
const size_t size = c - a + 1;
@@ -2997,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
/* Never escape high-chars (we don't know the encoding!!) */
inplace_escape_uri_utf(tempo, sizeof(tempo));
// unquoted url() (CSS/JS): keep parens escaped
if (ending_p == ')')
escape_url_parens(tempo, sizeof(tempo));
//if (!no_esc_utf)
// escape_uri(tempo); // escape with %xx
//else {

View File

@@ -220,4 +220,107 @@ crawl "$site5/main.css" "$out5"
found "good.css" "$out5"
notfound "trunc" "$out5"
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
# url() target is still captured; here it just must not underflow.
site6="$tmp/parse-off0"
mkdir -p "$site6"
printf 'body{}\n' >"$site6/off0.css"
printf 'url(off0.css)\n' >"$site6/main.css"
out6="$tmp/parse-off0-out"
crawl "$site6/main.css" "$out6"
found "off0.css" "$out6"
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
# fixture saves a bare file named GET; a live server mangles it to GET.html).
# window.open(url) detection must be unaffected.
site7="$tmp/xhropen"
mkdir -p "$site7"
gif "$site7/winopen.gif"
cat >"$site7/index.html" <<EOF
<html><body><script>
var x = new XMLHttpRequest();
x.open("GET", "ajax_info.txt");
var y = new XMLHttpRequest();
y.open("Post", "submit.cgi");
window.open("file://$site7/winopen.gif");
</script></body></html>
EOF
out7="$tmp/xhropen-out"
crawl "$site7/index.html" "$out7"
# negative control: without the fix a file named exactly GET is downloaded
notfound "GET" "$out7"
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
# method is rejected too, so a file named Post must not appear either
notfound "Post" "$out7"
# regression guard: window.open(url) is still detected, so its absolute URL is
# rewritten to a local link. The rewrite only happens if the parser saw it, so
# these two assertions fail if .open detection broke (not a trivial --near save).
saved7=$(savedhtml "$out7")
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
! grep -Fq 'window.open("file://' "$saved7" ||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
# Parens in an unquoted url(...) (#163): the source %28/%29 decode to literal
# '(' ')' in the saved name, but a literal ')' in the rewritten url() closes the
# token early, so they must stay encoded. Negative control: without the fix the
# %281%29 greps fail (parens are RFC2396 "mark" chars the escaper leaves alone).
site8="$tmp/cssparens"
mkdir -p "$site8"
for f in 'img (1).gif' 'a(b)c(1).gif' 'q (4).gif'; do gif "$site8/$f"; done
cat >"$site8/style.css" <<'EOF'
.a { background: url(img%20%281%29.gif); }
.b { background: url(a%28b%29c%281%29.gif); }
.c { background: url("q%20%284%29.gif"); }
EOF
out8="$tmp/cssparens-out"
crawl "$site8/style.css" "$out8"
found "img (1).gif" "$out8"
found "a(b)c(1).gif" "$out8"
found "q (4).gif" "$out8"
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
! echo "FAIL #163: parens in unquoted url() not percent-encoded on rewrite" || exit 1
grep -Fq 'url(a%28b%29c%281%29.gif)' "$css8" ||
! echo "FAIL #163: not every paren in a url() was percent-encoded" || exit 1
grep -Fq 'url("q%20%284%29.gif")' "$css8" ||
! echo "FAIL #163: quoted url() altered or parens left literal on rewrite" || exit 1
# The url() detector is not CSS-specific: <script> and inline style= get the
# same encoding, but ordinary href/src (ending_p is the quote, not ')') keep
# literal parens -- the attribute checks guard the gate against over-firing.
site9="$tmp/urlparens"
mkdir -p "$site9"
for f in 'js (1).gif' 'inl (2).gif' 'asrc (3).gif' 'ahref (4).gif'; do gif "$site9/$f"; done
cat >"$site9/index.html" <<EOF
<html><body>
<script>var bg = "url(js%20%281%29.gif)";</script>
<div style="background-image:url(inl%20%282%29.gif)"></div>
<img src="asrc%20%283%29.gif">
<a href="ahref%20%284%29.gif">link</a>
</body></html>
EOF
out9="$tmp/urlparens-out"
crawl "$site9/index.html" "$out9"
saved9=$(savedhtml "$out9")
test -n "$saved9" || ! echo "FAIL: saved urlparens page not found" || exit 1
# rewrite-only: the JS-string asset is not queued for download
grep -Fq 'url(js%20%281%29.gif)' "$saved9" ||
! echo "FAIL #163: parens in <script> url() not percent-encoded" || exit 1
found "inl (2).gif" "$out9"
grep -Fq 'url(inl%20%282%29.gif)' "$saved9" ||
! echo "FAIL #163: parens in inline style url() not percent-encoded" || exit 1
found "asrc (3).gif" "$out9"
found "ahref (4).gif" "$out9"
grep -Fq 'src="asrc%20(3).gif"' "$saved9" ||
! echo "FAIL #163: parens in a plain src attribute were wrongly encoded" || exit 1
grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
! echo "FAIL #163: parens in a plain href attribute were wrongly encoded" || exit 1
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
exit 0