Compare commits

..

2 Commits

Author SHA1 Message Date
Xavier Roche
7eacbbaf2a Quoted attribute values mid-tag were invisible to the aggressive parser
Even with the automaton unfrozen, a quoted string only qualified when
its next non-space character was one of ),;>/+ or end-of-line: in HTML
terms, only the last attribute of a tag (or line) could be detected.
The mid-tag attributes of #201 (data-gifsrc="x.gif" data-poster=...)
were structurally unreachable, making detection depend on attribute
order and source formatting.

In a tag outside any script, a quoted value ends at its closing quote,
so waive the follower requirement there. The arm resolves the owning
attribute name itself and declines for no-detect/xmlns names and
non-attribute quotes: the intag_startattr lookup is unreliable mid-tag
(repointed at every in-tag whitespace, stale on glued attributes), and
adversarial review showed a plain intag arm fetching alt/xmlns values
that master suppressed. Script and event-handler contexts keep the
strict gate.

Tests pin each automaton-exit reset (handler fixtures freeze lastc on
';' so they stay live), the nodetect/xmlns bypass shapes, the frozen-
quote phantom-fetch mode, and the '='-resolution decoys. Runtime
audits in httrack-works/issue-201-203/.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-05 23:24:22 +02:00
Xavier Roche
38cff3ce5c Aggressive parser dies for the rest of the page after the first script
The script automaton state (inscript_state_pos) is reset when a script
or style element is entered, but never when it exits. The dirty-parser
character tracker (parseall_lastc) only advances while that state is
INSCRIPT_START, without checking inscript, so whatever state the
automaton holds at </script> freezes for all following HTML. The '/' of
</script> itself is fed to the automaton before the exit branch runs,
so even a clean script deterministically parks it in INSCRIPT_SLASH:
URL detection in unknown attributes (data-*, content) goes dead after
the first script on the page (#201, #203). A </script> inside a JS
string or comment freezes a quote state with parseall_lastc stuck on
'=', turning stray quoted tokens into phantom fetches instead.

Reset the state at the three live exit sites, mirroring the entry
resets. Regression test proven to fail on the unfixed parser.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-05 19:35:30 +02:00
2 changed files with 112 additions and 4 deletions

View File

@@ -192,6 +192,48 @@ static void hts_automate_lookup(const script_automate *aut) {
}
}
/* Find the attribute name that owns the quoted value beginning at 'quote',
   inside the tag whose '<' is at 'tag_start'.
   Returns a pointer to the first byte of the name and stores the
   one-past-the-end position in *nend. Returns NULL when the quote is not
   directly preceded by "name [blanks] = [blanks]", or when the name found is
   the tag name itself. *nend is left untouched if no '=' precedes the
   quote. */
static const char *dirty_attr_name(const char *quote, const char *tag_start,
                                   const char **nend) {
  const char *p;
  const char *first;

  /* step back over tabs/spaces separating the quote from the '=' */
  for (p = quote - 1; p > tag_start && is_taborspace(*p); p--)
    ;
  if (p == tag_start || *p != '=')
    return NULL;

  /* step back over tabs/spaces separating the '=' from the name */
  for (p--; p > tag_start && is_taborspace(*p); p--)
    ;
  *nend = p + 1;

  /* step back to the byte just before the name: the previous '=', a quote
     (attribute glued to the preceding value), whitespace, or the '<' */
  for (; p > tag_start; p--) {
    if (*p == '=' || *p == '\"' || *p == '\'' || is_realspace(*p))
      break;
  }
  first = p + 1;

  /* empty name: nothing between the delimiter and the '=' */
  if (first >= *nend)
    return NULL;
  /* a name starting right after '<' is the tag name, not an attribute */
  if (first <= tag_start + 1)
    return NULL;
  return first;
}
/* Decide whether the in-tag quoted value at 'quote' may be handed to the
   dirty parser. The owning attribute name is resolved here through
   dirty_attr_name() (intag_startattr is unreliable mid-tag).
   Returns HTS_FALSE when the quote is not an attribute value, when the name
   equals an entry of hts_nodetect, or when the name is "xmlns" or starts
   with "xmlns:"; returns HTS_TRUE otherwise. */
static hts_boolean dirty_attr_detectable(const char *quote,
                                         const char *tag_start) {
  const char *name_end;
  const char *const attr = dirty_attr_name(quote, tag_start, &name_end);
  int idx;
  int matched;

  if (attr == NULL)
    return HTS_FALSE;

  /* hts_nodetect is scanned up to its first empty entry; only a match that
     covers the whole name counts, a mere prefix match does not */
  idx = 0;
  while (strnotempty(hts_nodetect[idx])) {
    matched = strfield(attr, hts_nodetect[idx]);
    if (matched != 0 && attr + matched == name_end)
      return HTS_FALSE;
    idx++;
  }

  /* namespace declarations: bare "xmlns" or any "xmlns:prefix" */
  matched = strfield(attr, "xmlns");
  if (matched != 0) {
    if (attr + matched == name_end || attr[matched] == ':')
      return HTS_FALSE;
  }
  return HTS_TRUE;
}
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
static void hts_automate_increment(const script_automate *aut, int steps) {
while (steps > 0) {
@@ -754,6 +796,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
) {
if (inscript_tag) {
inscript_tag = inscript = 0;
// reset the automaton on exit or its state leaks into plain HTML
inscript_state_pos = INSCRIPT_START;
intag = 0;
incomment = 0;
intag_start_valid = 0;
@@ -837,6 +881,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
if (*html == inscript_tag_lastc) {
/* sortir */
inscript_tag = inscript = 0;
inscript_state_pos = INSCRIPT_START;
incomment = 0;
if (opt->parsedebug) {
HT_ADD("<@@ /inscript @@>");
@@ -1293,6 +1338,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
a--;
if (*a == '<') { // sûr que c'est un tag?
inscript = 0;
inscript_state_pos = INSCRIPT_START;
if (opt->parsedebug) {
HT_ADD("<@@ /inscript @@>");
}
@@ -1568,8 +1614,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
while(is_taborspace(*a))
a++;
c = *a;
if (strchr("),;>/+\r\n", c)) { // exemple: ..img.gif";
// le / est pour funct("img.gif" /* URL */);
// in-tag, an attribute value ends at its quote: no
// delimiter required after it (mid-tag attrs, #201)
if (strchr("),;>/+\r\n", c) ||
(intag && !inscript && intag_start_valid &&
dirty_attr_detectable(html, intag_start))) {
// '/' covers a value followed by a JS comment
char BIGSTK tempo[HTS_URLMAXSIZE * 2];
char type[256];
int url_ok = 0; // url valide?

View File

@@ -106,8 +106,6 @@ grep -Eq 'srcset="j\.gif 2x"' "$saved" ||
# inline style attribute, with the URL unquoted, double-quoted and single-quoted
# (the quote style is preserved on rewrite). No-detect attributes (title, alt,
# ...) are left untouched. Asserted by rewrite (deterministic), not download.
# data-* (#201/#203) is omitted: its detection is currently nondeterministic and
# can't be locked yet.
site2="$tmp/attrs"
mkdir -p "$site2"
for f in xl ibg ibgs cex cexd cexs tt; do gif "$site2/$f.gif"; done
@@ -352,4 +350,64 @@ found "v.webm" "$out10"
found "subs.vtt" "$out10"
notfound "plain.gif" "$out10"
# Unknown-attr (data-*) URLs (#201/#203): the script automaton state must reset at
# </script>, or detection dies for the rest of the page after the first script.
site11="$tmp/dataattr"
mkdir -p "$site11"
for f in pre post mid spdata handler handler2 jsdecoy jsdecoy2 nodecoy \
spalt glalt spxml jtail1 jtail2 textdecoy tagdecoy phantom; do gif "$site11/$f.gif"; done
cat >"$site11/index.html" <<EOF
<html><body>
<img data-pre="file://$site11/pre.gif">
<script>var x = 1;</script>
<img data-post="file://$site11/post.gif">
<img data-mid="file://$site11/mid.gif" alt="mid-tag attr, no delimiter after">
<img data-sp = "file://$site11/spdata.gif" q=1>
<img alt="nodecoy.gif" src="pre.gif">
<img alt = "spalt.gif" id=x>
<img src="pre.gif"alt="glalt.gif" id=x>
<p xmlns:bar = "spxml.gif" id=x></p>
<div data-json='["jtail1.gif","jtail2.gif"]' q=1></div>
<p>t = "textdecoy.gif" q</p>
<img ="tagdecoy.gif" q>
<a onclick='q = 1; "h1'>x</a>
<img data-h1="file://$site11/handler.gif">
<a onclick='w = 1; "h2>x</a>
<img data-h2="file://$site11/handler2.gif">
<script>var s; s = 1; "jsdecoy.gif";</script>
<script>var s2 = "jsdecoy2.gif" x;</script>
<script>var f = "</script>
<img alt="y" "phantom.gif">
</body></html>
EOF
out11="$tmp/dataattr-out"
crawl "$site11/index.html" "$out11"
saved11=$(savedhtml "$out11")
test -n "$saved11" || ! echo "FAIL: saved dataattr page not found" || exit 1
grep -Fq 'data-pre="pre.gif"' "$saved11" ||
! echo "FAIL #201: data-* URL before any script not detected/rewritten" || exit 1
grep -Fq 'data-post="post.gif"' "$saved11" ||
! echo "FAIL #201: data-* URL after a script not detected (state leak)" || exit 1
grep -Fq 'data-mid="mid.gif"' "$saved11" ||
! echo "FAIL #201: mid-tag data-* URL not detected" || exit 1
grep -Fq 'data-sp = "spdata.gif"' "$saved11" ||
! echo "FAIL #201: spaced-= data-* URL not detected" || exit 1
found "handler.gif" "$out11" # automaton reset at the handler-terminator exit
found "handler2.gif" "$out11" # ... and at the '>' exit of an unterminated handler
# a JS string not preceded by =/(/, is still ignored after the reset
notfound "jsdecoy.gif" "$out11"
# in-script, the strict follower gate still applies (intag is 1 in script bodies)
notfound "jsdecoy2.gif" "$out11"
# no-detect attrs stay exempt, incl. spaced-'=' and glued-attr forms
notfound "nodecoy.gif" "$out11"
notfound "spalt.gif" "$out11"
notfound "glalt.gif" "$out11"
notfound "spxml.gif" "$out11"
# not an attribute value: a comma-list tail (no '=' before its quote) and a
# '='-preceded token in out-of-tag text both stay ignored
notfound "jtail2.gif" "$out11"
notfound "textdecoy.gif" "$out11"
notfound "tagdecoy.gif" "$out11" # a '=' glued to the tag name is not an attr
# a </script> inside a JS string must not cause phantom fetches later
notfound "phantom.gif" "$out11"
exit 0