Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
5d0ae89e36 Follow <source>/<track> as embedded near-links (#451)
src/srcset are already extracted from any element via hts_detect[], so
<source>/<track> URLs were captured. But hts_detect_embed only listed
<img src>, so at the recursion-depth boundary under --near these media
were treated as plain links and dropped, unlike <img>. Add a separate
HTML5 media table (keeping the legacy one clang-format-stable) and chain
the embed lookup through both.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-29 00:03:39 +02:00
2 changed files with 47 additions and 9 deletions

View File

@@ -80,6 +80,10 @@ htspair_t hts_detect_embed[] = {
{NULL, NULL}
};
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
static const htspair_t hts_detect_embed_html5[] = {
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
/* Internal */
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
const char *fil, const char *tag,
@@ -136,6 +140,17 @@ static int cmp_token(const char *tag, const char *cmp) {
&& !isalnum((unsigned char) tag[p]));
}
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
const char *attribute) {
int i;
for (i = 0; table[i].tag != NULL; i++) {
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
return HTS_TRUE;
}
return HTS_FALSE;
}
static int hts_acceptlink_(httrackp * opt, int ptr,
const char *adr, const char *fil, const char *tag,
const char *attribute, int *set_prio_to,
@@ -163,15 +178,9 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
/* Built-in known tags (<img src=..>, ..) */
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
int i;
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
if (cmp_token(tag, hts_detect_embed[i].tag)
&& cmp_token(attribute, hts_detect_embed[i].attr)
) {
embedded_triggered = 1;
break;
}
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
embedded_triggered = 1;
}
}

View File

@@ -323,4 +323,33 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
site10="$tmp/html5media"
mkdir -p "$site10"
for f in img ss plain; do gif "$site10/$f.gif"; done
printf 'x' >"$site10/v.webm"
printf 'x' >"$site10/subs.vtt"
cat >"$site10/index.html" <<EOF
<html><body><a href="leaf.html">leaf</a></body></html>
EOF
cat >"$site10/leaf.html" <<EOF
<html><body>
<img src="img.gif">
<picture><source srcset="ss.gif 2x"></picture>
<video><source src="v.webm"></video>
<video><track src="subs.vtt"></video>
<a href="plain.gif">plain link past the boundary</a>
</body></html>
EOF
out10="$tmp/html5media-out"
rm -rf "$out10"
mkdir -p "$out10"
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
found "img.gif" "$out10"
found "ss.gif" "$out10"
found "v.webm" "$out10"
found "subs.vtt" "$out10"
notfound "plain.gif" "$out10"
exit 0