Compare commits

...

2 Commits

Author SHA1 Message Date
Xavier Roche
2e948b9acd htsparse: percent-encode parens in rewritten CSS url() (#163)
A source url(...) whose target encodes '(' ')' as %28/%29 was rewritten
with literal parens, because they are RFC2396 "mark" characters that the
URI escaper (escape_uri_utf, mode 30) leaves alone. In an unquoted CSS
url(...) the literal ')' closes the token early, so the browser mis-parses
the value and drops the background image.

Re-escape '(' and ')' back to %28/%29 when emitting the link, gated on the
url() context (ending_p == ')'). The UA decodes them to the saved-on-disk
name, so the reference still resolves. Quoted url("...") and ordinary HTML
attributes keep their parens, matching prior behavior.

Test in 01_engine-parse.test crawls a CSS fixture whose url() references a
%20%28...%29 name and asserts the rewrite keeps the parens encoded;
negative control confirmed (literal-paren output fails it).

Closes #163

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 10:01:17 +02:00
Xavier Roche
cae11499f1 Merge pull request #399 from xroche/fix/js-string-falsepos-218
htsparse: don't treat XHR.open's method argument as a URL (#218)
2026-06-19 20:36:26 +02:00
2 changed files with 84 additions and 0 deletions

View File

@@ -317,6 +317,27 @@ static int is_http_method(const char *s, size_t len) {
return 0;
}
/* Percent-encode '(' and ')' in a link emitted into an unquoted url(...) (CSS
or JS): a literal ')' closes the token early and the UA mis-parses the value
(#163). The UA decodes %28/%29 back to the saved-on-disk name. */
static void escape_url_parens(char *const s, const size_t size) {
char BIGSTK buff[HTS_URLMAXSIZE * 2];
size_t i, j;
for (i = 0, j = 0; s[i] != '\0' && j + 3 < size && j + 3 < sizeof(buff);
i++) {
if (s[i] == '(' || s[i] == ')') {
buff[j++] = '%';
buff[j++] = '2';
buff[j++] = s[i] == '(' ? '8' : '9';
} else {
buff[j++] = s[i];
}
}
buff[j] = '\0';
strlcpybuff(s, buff, size);
}
/* Main parser */
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
char catbuff[CATBUFF_SIZE];
@@ -3027,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
/* Never escape high-chars (we don't know the encoding!!) */
inplace_escape_uri_utf(tempo, sizeof(tempo));
// unquoted url() (CSS/JS): keep parens escaped
if (ending_p == ')')
escape_url_parens(tempo, sizeof(tempo));
//if (!no_esc_utf)
// escape_uri(tempo); // escape with %xx
//else {

View File

@@ -264,4 +264,63 @@ grep -Fq 'window.open("winopen.gif")' "$saved7" ||
! grep -Fq 'window.open("file://' "$saved7" ||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
# Parens in an unquoted url(...) (#163): the source %28/%29 decode to literal
# '(' ')' in the saved name, but a literal ')' in the rewritten url() closes the
# token early, so they must stay encoded. Negative control: without the fix the
# %281%29 greps fail (parens are RFC2396 "mark" chars the escaper leaves alone).
site8="$tmp/cssparens"
mkdir -p "$site8"
for f in 'img (1).gif' 'a(b)c(1).gif' 'q (4).gif'; do gif "$site8/$f"; done
cat >"$site8/style.css" <<'EOF'
.a { background: url(img%20%281%29.gif); }
.b { background: url(a%28b%29c%281%29.gif); }
.c { background: url("q%20%284%29.gif"); }
EOF
out8="$tmp/cssparens-out"
crawl "$site8/style.css" "$out8"
found "img (1).gif" "$out8"
found "a(b)c(1).gif" "$out8"
found "q (4).gif" "$out8"
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
! echo "FAIL #163: parens in unquoted url() not percent-encoded on rewrite" || exit 1
grep -Fq 'url(a%28b%29c%281%29.gif)' "$css8" ||
! echo "FAIL #163: not every paren in a url() was percent-encoded" || exit 1
grep -Fq 'url("q%20%284%29.gif")' "$css8" ||
! echo "FAIL #163: quoted url() altered or parens left literal on rewrite" || exit 1
# The url() detector is not CSS-specific: <script> and inline style= get the
# same encoding, but ordinary href/src (ending_p is the quote, not ')') keep
# literal parens -- the attribute checks guard the gate against over-firing.
site9="$tmp/urlparens"
mkdir -p "$site9"
for f in 'js (1).gif' 'inl (2).gif' 'asrc (3).gif' 'ahref (4).gif'; do gif "$site9/$f"; done
cat >"$site9/index.html" <<EOF
<html><body>
<script>var bg = "url(js%20%281%29.gif)";</script>
<div style="background-image:url(inl%20%282%29.gif)"></div>
<img src="asrc%20%283%29.gif">
<a href="ahref%20%284%29.gif">link</a>
</body></html>
EOF
out9="$tmp/urlparens-out"
crawl "$site9/index.html" "$out9"
saved9=$(savedhtml "$out9")
test -n "$saved9" || ! echo "FAIL: saved urlparens page not found" || exit 1
# rewrite-only: the JS-string asset is not queued for download
grep -Fq 'url(js%20%281%29.gif)' "$saved9" ||
! echo "FAIL #163: parens in <script> url() not percent-encoded" || exit 1
found "inl (2).gif" "$out9"
grep -Fq 'url(inl%20%282%29.gif)' "$saved9" ||
! echo "FAIL #163: parens in inline style url() not percent-encoded" || exit 1
found "asrc (3).gif" "$out9"
found "ahref (4).gif" "$out9"
grep -Fq 'src="asrc%20(3).gif"' "$saved9" ||
! echo "FAIL #163: parens in a plain src attribute were wrongly encoded" || exit 1
grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
! echo "FAIL #163: parens in a plain href attribute were wrongly encoded" || exit 1
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
exit 0