mirror of
https://github.com/xroche/httrack.git
synced 2026-06-25 11:37:28 +03:00
The test scripts mostly ran with no error flags, so a failing command in
the middle would be ignored and the script would limp on to a misleading
result. Turn on strict mode everywhere, guarding the spots that legitimately
expect a non-zero exit:
- the htssafe overflow probes (-#8) deliberately abort, and the strsafe/
cmdline crawls capture an exit code to assert on, so those are run with
`|| true` / `|| rc=$?` rather than letting set -e kill the script first;
- the parser fixture crawl ignores httrack's own exit (it checks the mirrored
files), so it keeps `|| true`;
- 02_update-cache replaced `find ... | grep -q .` with a `-print -quit`
command substitution: under pipefail, grep -q can close the pipe early and
leave find killed by SIGPIPE, which would spuriously report an existing file as missing;
- 12_crawl_https guards $HTTPS_SUPPORT with `${...:-}` for set -u.
02_manpage-regen and 01_engine-cache stay on `set -eu` (no pipefail): both are
run via $(BASH), which can be a plain POSIX /bin/sh where `set -o pipefail`
does not exist.
shellcheck clean; make check: 15 PASS, 7 SKIP (offline).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
158 lines
6.9 KiB
Bash
Executable File
158 lines
6.9 KiB
Bash
Executable File
#!/bin/bash
#
# Offline HTML parser tests: each section crawls a file:// fixture (no network)
# and checks which assets the parser captured and how it rewrote the links.

set -euo pipefail

# private scratch directory holding every fixture and mirror created below
tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_parse.XXXXXX") || exit 1

# Clean up on every exit path. The signal traps only call exit (128 + signal
# number) so that the EXIT trap does the removal and the script really stops;
# a signal handler that merely ran rm would return and let the remaining
# assertions run against a deleted scratch directory.
trap 'rm -rf "$tmp"' EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 131' QUIT
trap 'exit 141' PIPE
trap 'exit 143' TERM

# a minimal valid 1x1 GIF, reused for every referenced asset
#
# gif <path>: write the 43-byte image to <path>, replacing any existing file.
# The bytes are spelled as octal escapes in the printf format string itself
# (the string contains no % directive), so no argument expansion is involved.
gif() {
  local dest="$1"
  printf 'GIF89a\1\0\1\0\200\0\0\0\0\0\377\377\377!\371\4\1\0\0\0\0,\0\0\0\0\1\0\1\0\0\2\2D\1\0;' >"$dest"
}

# crawl <fixture-html> into <out> with link rewriting on, no extra fetching
#
# Arguments: $1 - absolute path of the fixture page (turned into a file:// URL)
#            $2 - output directory; wiped and recreated before the crawl
# Outputs:   httrack's stdout and stderr are captured in <out>/.log
# Returns:   always 0 once the output directory exists (see below)
crawl() {
  local html="$1" out="$2"
  # ${out:?} aborts instead of running rm -rf on an empty path
  rm -rf -- "${out:?}"
  mkdir -p -- "$out"
  # the crawl's own exit status is irrelevant here; the assertions below check
  # the mirrored files, so don't let set -e trip on a non-zero httrack exit
  httrack "file://$html" -O "$out" --quiet --near -n >"$out/.log" 2>&1 || true
}

# assert a file with the given basename was saved somewhere under <out>
#
# found <basename> <out>: returns 0 when a regular file named <basename>
# exists anywhere below <out>; otherwise prints a FAIL line and exits the
# whole script with status 1. The exit no longer depends on echo succeeding,
# so a closed or broken stdout cannot turn a failed assertion into a pass.
found() {
  local name="$1" root="$2"
  test -n "$(find "$root" -type f -name "$name" -print -quit)" || {
    echo "FAIL: expected '$name' to be downloaded under $root"
    exit 1
  }
}

# assert NO file with the given basename was saved (e.g. a descriptor token must
# not be mistaken for a URL)
#
# notfound <basename> <out>: returns 0 when no regular file named <basename>
# exists below <out>; otherwise prints a FAIL line and exits the whole script
# with status 1. A missing <out> is also a failure: find would print nothing
# for it, which would otherwise read as "not downloaded" and pass vacuously.
notfound() {
  local name="$1" root="$2"
  test -d "$root" || {
    echo "FAIL: cannot check for '$name': $root is not a directory"
    exit 1
  }
  test -z "$(find "$root" -type f -name "$name" -print -quit)" || {
    echo "FAIL: '$name' should not have been downloaded under $root"
    exit 1
  }
}

# the mirrored fixture page (under "file/"), not HTTrack's own landing index
#
# savedhtml <out>: print the path of the first index.html found below <out>
# whose path contains a "file/" directory component; prints nothing when there
# is none, so callers test the result for emptiness.
savedhtml() {
  find "$1" -type f -path '*/file/*' -name index.html -print -quit
}

# srcset on <img> and <source> (#235, #236): every candidate captured and
# rewritten, descriptors preserved, following attributes left intact.
#
# Each assertion below prints a FAIL line and exits 1 on the first mismatch;
# the exit does not depend on echo succeeding.
site="$tmp/srcset"
mkdir -p "$site"
for f in a b c d e f g h i j v dz; do gif "$site/$f.gif"; done
# unquoted heredoc: $site expands in the absolute-URL candidate
cat >"$site/index.html" <<EOF
<html><body>
<img src="a.gif" srcset="b.gif 480w, c.gif 800w">
<picture><source srcset="d.gif 1x, c.gif 2x"><img src="a.gif"></picture>
<img srcset="e.gif, f.gif">
<img srcset="g.gif 2x" alt="trailing attr after srcset">
<img srcset=" h.gif 2x , i.gif ">
<video><source src="v.gif"></video>
<img srcset="file://$site/j.gif 2x">
<img srcset="data:image/gif;base64,R0lGODlhAQABAAAAACw= 1x, dz.gif 2x">
<img srcset="">
<a href="a.gif">plain link still works</a>
</body></html>
EOF
out="$tmp/srcset-out"
crawl "$site/index.html" "$out"

# every candidate downloads, incl. unique tails (catches first-only parsing),
# whitespace-padded (h,i), <source src> (v), absolute (j), post-data: URI (dz)
for f in a b c d e f g h i j v dz; do found "$f.gif" "$out"; done

# the width/density descriptors are not URLs and must not be fetched
notfound "480w" "$out"
notfound "800w" "$out"
notfound "2x" "$out"

saved=$(savedhtml "$out")
test -n "$saved" || {
  echo "FAIL: saved index.html not found"
  exit 1
}

# descriptors must survive the rewrite (no "b.gif 480w" mangled into a path)
grep -Eq 'srcset="[^"]*480w[^"]*800w' "$saved" || {
  echo "FAIL: srcset width descriptors lost/reordered in rewritten HTML"
  exit 1
}
grep -Eq 'srcset="[^"]*1x[^"]*2x' "$saved" || {
  echo "FAIL: srcset density descriptors lost/reordered in rewritten HTML"
  exit 1
}
# the descriptor-less comma form keeps both candidates and the separator verbatim
grep -Eq 'srcset="e\.gif, f\.gif"' "$saved" || {
  echo "FAIL: comma-separated srcset without descriptors was altered"
  exit 1
}
# an attribute following srcset in the same tag must be left intact
grep -q 'alt="trailing attr after srcset"' "$saved" || {
  echo "FAIL: srcset swallowed a following attribute"
  exit 1
}

# a comma inside a URL (data: URI, CDN path) is part of the URL, not a split
# point (WHATWG): the data: URI stays verbatim; the next candidate (dz) downloads
grep -Fq 'data:image/gif;base64,R0lGODlhAQABAAAAACw= 1x' "$saved" || {
  echo "FAIL: a comma inside a data: URI srcset candidate was mis-split"
  exit 1
}

# real rewrite, not passthrough: the absolute file:// candidate becomes local
# (a flat fixture can't show this; the footer comment's file:// is not in srcset)
grep -Eq 'srcset="j\.gif 2x"' "$saved" || {
  echo "FAIL: absolute file:// srcset URL was not rewritten to a local link"
  exit 1
}
if grep -Eq 'srcset="[^"]*file://' "$saved"; then
  echo "FAIL: a file:// URL survived inside a rewritten srcset attribute"
  exit 1
fi

# xlink:href (#298) and CSS background-image (#237): detected and rewritten to
# local. background-image is covered in both an external <style> block and an
# inline style attribute, with the URL unquoted, double-quoted and single-quoted
# (the quote style is preserved on rewrite). No-detect attributes (title, alt,
# ...) are left untouched. Asserted by rewrite (deterministic), not download.
# data-* (#201/#203) is omitted: its detection is currently nondeterministic and
# can't be locked yet.
#
# Each assertion below prints a FAIL line and exits 1 on the first mismatch;
# the exit does not depend on echo succeeding.
site2="$tmp/attrs"
mkdir -p "$site2"
for f in xl ibg ibgs cex cexd cexs tt; do gif "$site2/$f.gif"; done
# unquoted heredoc: $site2 expands so every URL is an absolute file:// link
cat >"$site2/index.html" <<EOF
<html><head><style>
.a { background-image: url(file://$site2/cex.gif); }
.b { background-image: url("file://$site2/cexd.gif"); }
.c { background-image: url('file://$site2/cexs.gif'); }
</style></head><body>
<a xlink:href="file://$site2/xl.gif">xlink:href (#298)</a>
<div style="background-image:url(file://$site2/ibg.gif)"></div>
<div style="background-image:url('file://$site2/ibgs.gif')"></div>
<span title="file://$site2/tt.gif">excluded attribute</span>
</body></html>
EOF
out2="$tmp/attrs-out"
crawl "$site2/index.html" "$out2"
saved2=$(savedhtml "$out2")
test -n "$saved2" || {
  echo "FAIL: saved attrs page not found"
  exit 1
}

# detected attributes: the absolute URL is rewritten to a local link
grep -Eq 'xlink:href="xl\.gif"' "$saved2" || {
  echo "FAIL #298: xlink:href not detected/rewritten"
  exit 1
}

# #237 external <style> block, each quoting form, quote style preserved
grep -Eq 'url\(cex\.gif\)' "$saved2" || {
  echo "FAIL #237: unquoted background-image in <style> not rewritten"
  exit 1
}
grep -Eq 'url\("cexd\.gif"\)' "$saved2" || {
  echo "FAIL #237: double-quoted background-image in <style> not rewritten"
  exit 1
}
grep -Eq "url\('cexs\.gif'\)" "$saved2" || {
  echo "FAIL #237: single-quoted background-image in <style> not rewritten"
  exit 1
}

# #237 inline style attribute, unquoted and single-quoted url()
grep -Eq 'style="background-image:url\(ibg\.gif\)"' "$saved2" || {
  echo "FAIL #237: inline unquoted background-image not rewritten"
  exit 1
}
grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" || {
  echo "FAIL #237: inline single-quoted background-image not rewritten"
  exit 1
}

# no file:// URL survived inside any rewritten background-image
if grep -Eq 'background-image:[^;"]*file://' "$saved2"; then
  echo "FAIL #237: a file:// URL survived inside a rewritten background-image"
  exit 1
fi

# excluded attribute: title is on the no-detect list, so its value is left as-is
grep -q 'title="file://' "$saved2" || {
  echo "FAIL: a no-detect attribute (title) was wrongly rewritten"
  exit 1
}

exit 0