mirror of
https://github.com/xroche/httrack.git
synced 2026-07-06 00:46:30 +03:00
Compare commits
2 Commits
master
...
fix-201-20
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7eacbbaf2a | ||
|
|
38cff3ce5c |
@@ -192,6 +192,48 @@ static void hts_automate_lookup(const script_automate *aut) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Find the attribute name that owns the quoted value starting at 'quote'.

   quote:     the opening quote character of a candidate value, inside a tag.
   tag_start: start of the enclosing tag; expected to point at its '<'
              (a name beginning at tag_start + 1 is treated as the tag name).
   nend:      out; set to one past the last character of the name, so the name
              spans [returned pointer, *nend). Only meaningful when the result
              is non-NULL.

   Returns the first character of the attribute name, or NULL when the quote
   is not an attribute value: no '=' before it (blanks allowed on either side
   of the '='), an empty name, or a name that is really the tag name. */
static const char *dirty_attr_name(const char *quote, const char *tag_start,
                                   const char **nend) {
  const char *a;

  // a quote at (or before) the tag start has nothing in front of it; refuse
  // it here rather than stepping backwards out of the tag below
  if (quote == NULL || tag_start == NULL || nend == NULL ||
      quote <= tag_start)
    return NULL;

  // skip the blanks between '=' and the opening quote
  a = quote - 1;
  while (a > tag_start && is_taborspace(*a))
    a--;
  if (a == tag_start || *a != '=')
    return NULL;

  // skip the blanks between the name and '='
  a--;
  while (a > tag_start && is_taborspace(*a))
    a--;
  *nend = a + 1;

  // walk back over the name: it ends at a blank, at the previous attribute's
  // '=' or closing quote (glued attributes), or at the tag start
  while (a > tag_start && *a != '=' && *a != '\"' && *a != '\'' &&
         !is_realspace(*a))
    a--;
  a++;

  // a name starting right after '<' is the tag name, not an attribute
  return a < *nend && a > tag_start + 1 ? a : NULL;
}
|
||||
|
||||
/* Accept the in-tag quoted value at 'quote' for dirty parsing? Resolves the
|
||||
owning attribute itself (intag_startattr is unreliable mid-tag) and rejects
|
||||
no-detect/xmlns names. */
|
||||
static hts_boolean dirty_attr_detectable(const char *quote,
|
||||
const char *tag_start) {
|
||||
const char *nend;
|
||||
const char *name = dirty_attr_name(quote, tag_start, &nend);
|
||||
int i;
|
||||
if (name == NULL)
|
||||
return HTS_FALSE;
|
||||
for (i = 0; strnotempty(hts_nodetect[i]); i++) {
|
||||
const int l = strfield(name, hts_nodetect[i]);
|
||||
if (l && name + l == nend)
|
||||
return HTS_FALSE;
|
||||
}
|
||||
i = strfield(name, "xmlns");
|
||||
if (i && (name + i == nend || name[i] == ':'))
|
||||
return HTS_FALSE;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
|
||||
static void hts_automate_increment(const script_automate *aut, int steps) {
|
||||
while (steps > 0) {
|
||||
@@ -754,6 +796,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
) {
|
||||
if (inscript_tag) {
|
||||
inscript_tag = inscript = 0;
|
||||
// reset the automaton on exit or its state leaks into plain HTML
|
||||
inscript_state_pos = INSCRIPT_START;
|
||||
intag = 0;
|
||||
incomment = 0;
|
||||
intag_start_valid = 0;
|
||||
@@ -837,6 +881,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html == inscript_tag_lastc) {
|
||||
/* sortir */
|
||||
inscript_tag = inscript = 0;
|
||||
inscript_state_pos = INSCRIPT_START;
|
||||
incomment = 0;
|
||||
if (opt->parsedebug) {
|
||||
HT_ADD("<@@ /inscript @@>");
|
||||
@@ -1293,6 +1338,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
a--;
|
||||
if (*a == '<') { // sûr que c'est un tag?
|
||||
inscript = 0;
|
||||
inscript_state_pos = INSCRIPT_START;
|
||||
if (opt->parsedebug) {
|
||||
HT_ADD("<@@ /inscript @@>");
|
||||
}
|
||||
@@ -1568,8 +1614,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
while(is_taborspace(*a))
|
||||
a++;
|
||||
c = *a;
|
||||
if (strchr("),;>/+\r\n", c)) { // exemple: ..img.gif";
|
||||
// le / est pour funct("img.gif" /* URL */);
|
||||
// in-tag, an attribute value ends at its quote: no
|
||||
// delimiter required after it (mid-tag attrs, #201)
|
||||
if (strchr("),;>/+\r\n", c) ||
|
||||
(intag && !inscript && intag_start_valid &&
|
||||
dirty_attr_detectable(html, intag_start))) {
|
||||
// '/' covers a value followed by a JS comment
|
||||
char BIGSTK tempo[HTS_URLMAXSIZE * 2];
|
||||
char type[256];
|
||||
int url_ok = 0; // url valide?
|
||||
|
||||
@@ -106,8 +106,6 @@ grep -Eq 'srcset="j\.gif 2x"' "$saved" ||
|
||||
# inline style attribute, with the URL unquoted, double-quoted and single-quoted
|
||||
# (the quote style is preserved on rewrite). No-detect attributes (title, alt,
|
||||
# ...) are left untouched. Asserted by rewrite (deterministic), not download.
|
||||
# data-* (#201/#203) is omitted: its detection is currently nondeterministic and
|
||||
# can't be locked yet.
|
||||
site2="$tmp/attrs"
|
||||
mkdir -p "$site2"
|
||||
for f in xl ibg ibgs cex cexd cexs tt; do gif "$site2/$f.gif"; done
|
||||
@@ -352,4 +350,64 @@ found "v.webm" "$out10"
|
||||
found "subs.vtt" "$out10"
|
||||
notfound "plain.gif" "$out10"
|
||||
|
||||
# Unknown-attr (data-*) URLs (#201/#203): the script automaton state must reset at
|
||||
# </script>, or detection dies for the rest of the page after the first script.
|
||||
site11="$tmp/dataattr"
|
||||
mkdir -p "$site11"
|
||||
for f in pre post mid spdata handler handler2 jsdecoy jsdecoy2 nodecoy \
|
||||
spalt glalt spxml jtail1 jtail2 textdecoy tagdecoy phantom; do gif "$site11/$f.gif"; done
|
||||
cat >"$site11/index.html" <<EOF
|
||||
<html><body>
|
||||
<img data-pre="file://$site11/pre.gif">
|
||||
<script>var x = 1;</script>
|
||||
<img data-post="file://$site11/post.gif">
|
||||
<img data-mid="file://$site11/mid.gif" alt="mid-tag attr, no delimiter after">
|
||||
<img data-sp = "file://$site11/spdata.gif" q=1>
|
||||
<img alt="nodecoy.gif" src="pre.gif">
|
||||
<img alt = "spalt.gif" id=x>
|
||||
<img src="pre.gif"alt="glalt.gif" id=x>
|
||||
<p xmlns:bar = "spxml.gif" id=x></p>
|
||||
<div data-json='["jtail1.gif","jtail2.gif"]' q=1></div>
|
||||
<p>t = "textdecoy.gif" q</p>
|
||||
<img ="tagdecoy.gif" q>
|
||||
<a onclick='q = 1; "h1'>x</a>
|
||||
<img data-h1="file://$site11/handler.gif">
|
||||
<a onclick='w = 1; "h2>x</a>
|
||||
<img data-h2="file://$site11/handler2.gif">
|
||||
<script>var s; s = 1; "jsdecoy.gif";</script>
|
||||
<script>var s2 = "jsdecoy2.gif" x;</script>
|
||||
<script>var f = "</script>
|
||||
<img alt="y" "phantom.gif">
|
||||
</body></html>
|
||||
EOF
|
||||
out11="$tmp/dataattr-out"
|
||||
crawl "$site11/index.html" "$out11"
|
||||
saved11=$(savedhtml "$out11")
|
||||
test -n "$saved11" || ! echo "FAIL: saved dataattr page not found" || exit 1
|
||||
grep -Fq 'data-pre="pre.gif"' "$saved11" ||
|
||||
! echo "FAIL #201: data-* URL before any script not detected/rewritten" || exit 1
|
||||
grep -Fq 'data-post="post.gif"' "$saved11" ||
|
||||
! echo "FAIL #201: data-* URL after a script not detected (state leak)" || exit 1
|
||||
grep -Fq 'data-mid="mid.gif"' "$saved11" ||
|
||||
! echo "FAIL #201: mid-tag data-* URL not detected" || exit 1
|
||||
grep -Fq 'data-sp = "spdata.gif"' "$saved11" ||
|
||||
! echo "FAIL #201: spaced-= data-* URL not detected" || exit 1
|
||||
found "handler.gif" "$out11" # automaton reset at the handler-terminator exit
|
||||
found "handler2.gif" "$out11" # ... and at the '>' exit of an unterminated handler
|
||||
# a JS string not preceded by =/(/, is still ignored after the reset
|
||||
notfound "jsdecoy.gif" "$out11"
|
||||
# in-script, the strict follower gate still applies (intag is 1 in script bodies)
|
||||
notfound "jsdecoy2.gif" "$out11"
|
||||
# no-detect attrs stay exempt, incl. spaced-'=' and glued-attr forms
|
||||
notfound "nodecoy.gif" "$out11"
|
||||
notfound "spalt.gif" "$out11"
|
||||
notfound "glalt.gif" "$out11"
|
||||
notfound "spxml.gif" "$out11"
|
||||
# not '='-preceded: comma-list tails and out-of-tag text tokens stay ignored
|
||||
notfound "jtail2.gif" "$out11"
|
||||
notfound "textdecoy.gif" "$out11"
|
||||
notfound "tagdecoy.gif" "$out11" # a '=' glued to the tag name is not an attr
|
||||
# a </script> inside a JS string must not cause phantom fetches later
|
||||
notfound "phantom.gif" "$out11"
|
||||
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user