Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
42acbe6c97 htsparse: percent-encode parens in rewritten CSS url() (#163)
A source url(...) whose target encodes '(' ')' as %28/%29 was rewritten
with literal parens, because they are RFC2396 "mark" characters that the
URI escaper (escape_uri_utf, mode 30) leaves alone. In an unquoted CSS
url(...) the literal ')' closes the token early, so the browser mis-parses
the value and drops the background image.

Re-escape '(' and ')' back to %28/%29 when emitting the link, gated on the
url() context (ending_p == ')'). The UA decodes them to the saved-on-disk
name, so the reference still resolves. Quoted url("...") and ordinary HTML
attributes keep their parens, matching prior behavior.

Test in 01_engine-parse.test crawls a CSS fixture whose url() references a
%20%28...%29 name and asserts the rewrite keeps the parens encoded;
negative control confirmed (literal-paren output fails it).

Closes #163

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-19 20:49:13 +02:00
2 changed files with 43 additions and 0 deletions

View File

@@ -317,6 +317,27 @@ static int is_http_method(const char *s, size_t len) {
return 0;
}
/* Percent-encode '(' and ')' in a link emitted into an unquoted CSS url(...):
   a literal ')' closes the token early and the UA mis-parses the value (#163).
   The UA decodes %28/%29 back to the saved-on-disk name.

   s    : NUL-terminated string, rewritten in place.
   size : capacity of s in bytes, including the terminating NUL.

   The string is expanded in place, so no scratch buffer is needed. If the
   encoded form does not fit in size bytes, the result is truncated at a
   source character boundary: a %XX triple is never split, and a character
   is only dropped when its encoded form really does not fit (a plain
   character needs one byte, a paren needs three). */
static void escape_url_parens(char *const s, const size_t size) {
  size_t src_len = 0;           /* source characters that will be kept */
  size_t dst_len = 0;           /* length of their encoded form */
  size_t i, j;

  /* No room even for the terminating NUL: nothing can be written. */
  if (size == 0)
    return;

  /* Pass 1: measure how much of the source fits once encoded. */
  while (s[src_len] != '\0') {
    const size_t need = (s[src_len] == '(' || s[src_len] == ')') ? 3 : 1;

    if (dst_len + need >= size) /* keep one byte for the NUL */
      break;
    dst_len += need;
    src_len++;
  }

  /* Pass 2: expand from the end. The write index never drops below the
     read index, so source bytes still to be read are never overwritten.
     Once both indices meet, the remaining prefix holds no paren and is
     already in place. */
  i = src_len;
  j = dst_len;
  s[j] = '\0';
  while (i > 0 && j > i) {
    const char c = s[--i];

    if (c == '(' || c == ')') {
      s[--j] = (c == '(') ? '8' : '9';
      s[--j] = '2';
      s[--j] = '%';
    } else {
      s[--j] = c;
    }
  }
}
/* Main parser */
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
char catbuff[CATBUFF_SIZE];
@@ -3027,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
/* Never escape high-chars (we don't know the encoding!!) */
inplace_escape_uri_utf(tempo, sizeof(tempo));
// unquoted CSS url(...): keep parens escaped (#163)
if (ending_p == ')')
escape_url_parens(tempo, sizeof(tempo));
//if (!no_esc_utf)
// escape_uri(tempo); // escape with %xx
//else {

View File

@@ -264,4 +264,22 @@ grep -Fq 'window.open("winopen.gif")' "$saved7" ||
! grep -Fq 'window.open("file://' "$saved7" ||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
# Parens inside an unquoted CSS url(...) (#163): the saved-on-disk name has
# literal '(' ')' (the source %28/%29 decode when fetching), but a literal ')'
# in the rewritten url() would close the token early and break the value, so
# they must stay percent-encoded. Negative control: without the fix the output
# is url(img%20(1).gif) and the grep for %281%29 fails (parens are RFC2396
# "mark" chars, which the URI escaper leaves alone).
# Source fixture directory for this case.
site8="$tmp/cssparens"
mkdir -p "$site8"
# Image whose on-disk name holds a space and literal parens. `gif` is a helper
# defined elsewhere in this script; presumably it writes a small GIF file at
# the given path -- confirm against its definition.
gif "$site8/img (1).gif"
# '%%' is printf's escape for a literal '%', so the stylesheet written here
# contains url(img%20%281%29.gif).
printf 'body { background-image: url(img%%20%%281%%29.gif); }\n' >"$site8/style.css"
# Mirror output directory. `crawl` and `found` are helpers defined elsewhere;
# presumably crawl mirrors the first argument into the second and found fails
# the test when the named file is missing from the mirror -- confirm.
out8="$tmp/cssparens-out"
crawl "$site8/style.css" "$out8"
found "img (1).gif" "$out8"
# Locate the mirrored stylesheet: a regular file named style.css somewhere
# below a 'file' directory; -quit stops at the first match.
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
# Failure idiom used below: echo succeeds, '!' inverts that to a failure, so
# the trailing '|| exit 1' runs after the message is printed.
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
# -F matches the pattern as a fixed string ('%', '(' and ')' are literal);
# -q suppresses output so only the exit status is used.
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
! echo "FAIL #163: parens in unquoted CSS url() not percent-encoded on rewrite" || exit 1
exit 0