Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
43f72afbad test: expand offline engine self-test coverage
Add filter (-#0) and MIME (-#2) tests, and broaden the charset, entity,
IDNA, and path-simplify cases that previously had one or two assertions
each.

Cover the punycode, charset, and entity parsers (areas with a CVE
history) with malformed-input probes that check the hardened build exits
cleanly rather than overflowing. The IDNA and path-simplify edge cases
are pinned to RFC 3492 and RFC 3986 semantics.

The &nbsp; entity case documents the known U+00A0 -> space behavior in
htsencoding.c instead of asserting the spec byte, so a future fix is not
blocked by a stale test.
2026-06-13 09:55:19 +02:00
8 changed files with 212 additions and 18 deletions

View File

@@ -1,5 +1,36 @@
#!/bin/bash
#
# minimalistic charset test: the argument is the UTF-8 byte sequence of "café"
# (63 61 66 C3 A9). Decoded as iso-8859-1 every byte is one codepoint, so the
# UTF-8 output is "cafÃ©" (C3 -> U+00C3, A9 -> U+00A9), not "café".
test "$(httrack -O /dev/null -#3 "iso-8859-1" "café")" == "cafÃ©" || exit 1
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
conv() {
  # conv <charset> <input> <expected>
  # Runs `httrack -#3 <charset> <input>` and compares its stdout with
  # <expected>. On mismatch both values are printed to stderr (shell-quoted,
  # so raw bytes stay readable in the test log) and the script exits with 1.
  local actual
  actual=$(httrack -O /dev/null -#3 "$1" "$2")
  if [[ "$actual" != "$3" ]]; then
    printf 'FAIL: httrack -#3 %q %q: expected %q, got %q\n' \
      "$1" "$2" "$3" "$actual" >&2
    exit 1
  fi
}
# crash probe: malformed input must exit cleanly, not abort.
runs() {
  # runs <charset> <input>
  # Crash probe: runs `httrack -#3 <charset> <input>` with all output
  # discarded; only the exit status matters. A nonzero status is reported on
  # stderr (so the test log shows how the run ended) and the script exits
  # with 1.
  local status=0
  httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || status=$?
  if ((status != 0)); then
    printf 'FAIL: httrack -#3 %q %q exited with status %d\n' \
      "$1" "$2" "$status" >&2
    exit 1
  fi
}
# Assertions run top to bottom; conv/runs exit the script with status 1 on the
# first failure.
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
# already UTF-8: identity
conv 'utf-8' 'café' 'café'
# bytes reinterpreted as latin-1: each input byte becomes one codepoint, so the
# two bytes of "é" come out as two characters (C3 -> U+00C3 'Ã', A9 -> U+00A9 '©')
conv 'iso-8859-1' 'café' 'cafÃ©'
# windows-1252 is NOT latin-1: 0x80 is the euro sign, not U+0080. This is the
# case that actually exercises the cp1252 table (the 0x80-0x9F range).
conv 'windows-1252' $'\x80' '€'
# pure ASCII is charset-invariant
conv 'us-ascii' 'hello' 'hello'
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
# decoded and yields empty output (an error is printed to stderr).
conv 'no-such-charset-xyz' 'abc' 'abc'
test "$(httrack -O /dev/null -#3 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
runs 'utf-8' $'\x80'
runs 'utf-8' $'\xc3'

View File

@@ -1,5 +1,49 @@
#!/bin/bash
#
# basic entity handling, including bogus entities (&foo;, &#e9;) that must be left verbatim
test "$(httrack -O /dev/null -#6 "&foo;&nbsp;th&eacute;&amp;caf&#xe9;&#e9;&#x3082;&#12398;&#x306e;&#x3051;&#x59eb;")" == "&foo; thé&café&#e9;もののけ姫" || exit 1
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
# -#6 <string> prints the string with entities decoded (UTF-8 output).
ent() {
  # ent <input> <expected>
  # Runs `httrack -#6 <input>` and compares its stdout with <expected>.
  # On mismatch both values are printed to stderr (shell-quoted, so
  # non-printable bytes stay readable) and the script exits with 1.
  local actual
  actual=$(httrack -O /dev/null -#6 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: httrack -#6 %q: expected %q, got %q\n' \
      "$1" "$2" "$actual" >&2
    exit 1
  fi
}
# crash probe: malformed input must exit cleanly, not abort.
runs() {
  # runs <input>
  # Crash probe: runs `httrack -#6 <input>` with all output discarded; only
  # the exit status matters. A nonzero status is reported on stderr and the
  # script exits with 1.
  local status=0
  httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || status=$?
  if ((status != 0)); then
    printf 'FAIL: httrack -#6 %q exited with status %d\n' "$1" "$status" >&2
    exit 1
  fi
}
# Assertions run top to bottom; ent/runs exit the script with status 1 on the
# first failure, so later cases are not reached once one fails.
# named entities
ent '&amp;' '&'
ent '&lt;&gt;' '<>'
ent '&eacute;' 'é'
# numeric: decimal and hex
ent '&#65;&#66;' 'AB'
ent '&#x41;' 'A'
ent '&#xe9;' 'é'
# malformed numeric reference (decimal 'e9' has no digits) is left verbatim
ent '&#e9;' '&#e9;'
# U+0000 is not emitted; the reference is left verbatim
ent '&#0;' '&#0;'
# unknown entity is left verbatim
ent '&unknownentity;' '&unknownentity;'
# no entities: pass-through
ent 'plain text' 'plain text'
# decoding is a single pass: &amp;amp; -> &amp; (not &)
ent '&amp;amp;' '&amp;'
# KNOWN BUG: &nbsp; (U+00A0) decodes to a plain space (0x20), not C2 A0. The
# engine forces 160 -> 32 in htsencoding.c (FIXME hack). Locked here; if that
# hack is ever removed, update this to expect the C2 A0 byte.
ent '&nbsp;' ' '
# overflowing numeric reference must not crash (value far above U+10FFFF);
# only the exit status is checked, the decoded output is ignored
runs '&#9999999999;'
# original compound case. NOTE: the space after '&foo;' is the &nbsp; known bug
# above (U+00A0 -> 0x20), not a real space in the source.
ent '&foo;&nbsp;th&eacute;&amp;caf&#xe9;&#e9;&#x3082;&#12398;&#x306e;&#x3051;&#x59eb;' '&foo; thé&café&#e9;もののけ姫'

49
tests/01_engine-filter.test Executable file
View File

@@ -0,0 +1,49 @@
#!/bin/bash
#
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
match() {
  # match <filter> <string>
  # Asserts that `httrack -#0 <filter> <string>` prints exactly
  # "<string> does match <filter>". On any other output the expected and
  # actual lines are printed to stderr and the script exits with 1.
  local expected actual
  expected="$2 does match $1"
  actual=$(httrack -O /dev/null -#0 "$1" "$2")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: expected "%s", got "%s"\n' "$expected" "$actual" >&2
    exit 1
  fi
}
nomatch() {
  # nomatch <filter> <string>
  # Asserts that `httrack -#0 <filter> <string>` prints exactly
  # "<string> does NOT match <filter>". On any other output the expected and
  # actual lines are printed to stderr and the script exits with 1.
  local expected actual
  expected="$2 does NOT match $1"
  actual=$(httrack -O /dev/null -#0 "$1" "$2")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: expected "%s", got "%s"\n' "$expected" "$actual" >&2
    exit 1
  fi
}
# Assertions run top to bottom; match/nomatch exit the script with status 1 on
# the first failure. The first argument is the filter, the second the string.
# bare star matches everything
match '*' 'anything/at/all'
# prefix / suffix
match 'foo*' 'foobar'
nomatch 'foo*' 'xfoobar'
match '*.gif' 'a/b/c.gif'
# extension match is case-insensitive
match '*.GIF' 'a.gif'
# character classes
match '*[A-Z].txt' 'B.txt'
nomatch '*[A-Z].txt' 'b.txt'
match '*[0-9]' '5'
nomatch '*[0-9]' 'x'
# comma-separated class: both ranges are active, the comma is not matched
# literally and a char in neither range fails
match '*[A-Z,0-9]' 'Q'
match '*[A-Z,0-9]' '3'
nomatch '*[A-Z,0-9]' 'a'
# named groups: [file] stops at '/', [path] spans it
match '*[file].html' 'foo.html'
nomatch '*[file].html' 'foo/bar.html'
match '*[path]x' 'a/b/x'
# *[] means "nothing more after the star"
nomatch '*[]' 'abc'
# multiple stars
match '*foo*bar' 'foozbar'
# '?' is the query-string marker, not a single-char wildcard
nomatch 'a?c' 'abc'

View File

@@ -1,10 +1,36 @@
#!/bin/bash
#
# IDNA routine
test "$(httrack -O /dev/null -#4 "www.café.com")" == "www.xn--caf-dma.com" || exit 1
test "$(httrack -O /dev/null -#4 "www.もののけ姫-the-movie.com")" == "www.xn---the-movie-g63irla2z8297c.com" || exit 1
# IDNA / punycode encode (-#4) and decode (-#5). This code has a CVE history,
# so the edge cases below cover passthrough, round-trips, and malformed input.
# reverse IDNA
test "$(httrack -O /dev/null -#5 "www.xn--caf-dma.com")" == "www.café.com" || exit 1
test "$(httrack -O /dev/null -#5 "www.xn---the-movie-g63irla2z8297c.com")" == "www.もののけ姫-the-movie.com" || exit 1
enc() {
  # enc <hostname> <expected>
  # Asserts that `httrack -#4 <hostname>` (IDNA encode) prints <expected>.
  # On mismatch both values are printed to stderr and the script exits with 1.
  local actual
  actual=$(httrack -O /dev/null -#4 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: httrack -#4 %q: expected %q, got %q\n' \
      "$1" "$2" "$actual" >&2
    exit 1
  fi
}
dec() {
  # dec <hostname> <expected>
  # Asserts that `httrack -#5 <hostname>` (IDNA decode) prints <expected>.
  # On mismatch both values are printed to stderr and the script exits with 1.
  local actual
  actual=$(httrack -O /dev/null -#5 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: httrack -#5 %q: expected %q, got %q\n' \
      "$1" "$2" "$actual" >&2
    exit 1
  fi
}
# crash probe: malformed ACE input must exit cleanly, not abort.
runs() {
  # runs <hostname>
  # Crash probe: runs `httrack -#5 <hostname>` with all output discarded; only
  # the exit status matters. A nonzero status is reported on stderr and the
  # script exits with 1.
  local status=0
  httrack -O /dev/null -#5 "$1" >/dev/null 2>&1 || status=$?
  if ((status != 0)); then
    printf 'FAIL: httrack -#5 %q exited with status %d\n' "$1" "$status" >&2
    exit 1
  fi
}
# Assertions run top to bottom; enc/dec/runs exit the script with status 1 on
# the first failure.
# encode
enc 'www.café.com' 'www.xn--caf-dma.com'
enc 'www.もののけ姫-the-movie.com' 'www.xn---the-movie-g63irla2z8297c.com'
enc 'münchen.de' 'xn--mnchen-3ya.de'
# decode (reverse of the above)
dec 'www.xn--caf-dma.com' 'www.café.com'
dec 'www.xn---the-movie-g63irla2z8297c.com' 'www.もののけ姫-the-movie.com'
dec 'xn--mnchen-3ya.de' 'münchen.de'
# pure-ASCII hostnames are unchanged either way
enc 'plain.example.com' 'plain.example.com'
dec 'plain.example.com' 'plain.example.com'
enc 'a.b.c.example.org' 'a.b.c.example.org'
# an all-ASCII label (even one starting with the xn-- prefix) is passed through
# by the encoder untouched, since there is nothing to encode
enc 'xn--already-encoded.com' 'xn--already-encoded.com'
# an empty punycode payload decodes back to the bare xn-- label
dec 'xn--' 'xn--'
# malformed ACE payloads (invalid base-36, garbage) must not crash; only the
# exit status of the decoder (-#5) is checked here, not its output
runs 'xn--!!!'
runs 'xn--already-encoded.com'

27
tests/01_engine-mime.test Executable file
View File

@@ -0,0 +1,27 @@
#!/bin/bash
#
# MIME type guessing from extension (get_httptype / give_mimext).
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
mime() {
  # mime <path> <mime-type>
  # Asserts that the first line printed by `httrack -#2 <path>` is exactly
  # "<path> is '<mime-type>'". On mismatch the expected and actual lines are
  # printed to stderr and the script exits with 1.
  local output first_line expected
  output=$(httrack -O /dev/null -#2 "$1")
  # Keep only the first line; parameter expansion avoids the obsolescent
  # `head -1` form and the extra process/pipeline.
  first_line=${output%%$'\n'*}
  expected="$1 is '$2'"
  if [[ "$first_line" != "$expected" ]]; then
    printf 'FAIL: expected "%s", got "%s"\n' "$expected" "$first_line" >&2
    exit 1
  fi
}
unknown() {
  # unknown <path>
  # Asserts that the first line printed by `httrack -#2 <path>` is exactly
  # "<path> is of an unknown MIME type". On mismatch the expected and actual
  # lines are printed to stderr and the script exits with 1.
  local output first_line expected
  output=$(httrack -O /dev/null -#2 "$1")
  # Keep only the first line; parameter expansion avoids the obsolescent
  # `head -1` form and the extra process/pipeline.
  first_line=${output%%$'\n'*}
  expected="$1 is of an unknown MIME type"
  if [[ "$first_line" != "$expected" ]]; then
    printf 'FAIL: expected "%s", got "%s"\n' "$expected" "$first_line" >&2
    exit 1
  fi
}
# Assertions run top to bottom; mime/unknown exit the script with status 1 on
# the first failure. Only the first output line (the MIME type) is compared;
# the "local type" line is ignored.
mime '/a/b.html' 'text/html'
mime '/a/b.htm' 'text/html'
mime '/x.css' 'text/css'
mime '/x.js' 'application/x-javascript'
mime '/x.png' 'image/png'
mime '/x.jpg' 'image/jpeg'
mime '/x.gif' 'image/gif'
mime '/x.txt' 'text/plain'
mime '/x.xml' 'application/xml'
mime '/x.pdf' 'application/pdf'
# no extension, or one not in the table
unknown '/noext'
unknown '/x.unknownext'

View File

@@ -1,9 +1,26 @@
#!/bin/bash
#
# simplify engine
test "$(httrack -O /dev/null -#1 ./foo/bar/)" == "simplified=foo/bar/" || exit 1
test "$(httrack -O /dev/null -#1 ./foo/bar)" == "simplified=foo/bar" || exit 1
test "$(httrack -O /dev/null -#1 ./foo/./bar)" == "simplified=foo/bar" || exit 1
test "$(httrack -O /dev/null -#1 ./foo/bar/.././tmp/foobar)" == "simplified=foo/tmp/foobar" || exit 1
test "$(httrack -O /dev/null -#1 ./foo/bar/.././tmp/foobar/../foobaz)" == "simplified=foo/tmp/foobaz" || exit 1
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
simp() {
  # simp <path> <expected>
  # Asserts that `httrack -#1 <path>` prints exactly "simplified=<expected>".
  # On mismatch the expected and actual lines are printed to stderr and the
  # script exits with 1.
  local expected actual
  expected="simplified=$2"
  actual=$(httrack -O /dev/null -#1 "$1")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: httrack -#1 %q: expected "%s", got "%s"\n' \
      "$1" "$expected" "$actual" >&2
    exit 1
  fi
}
# Assertions run top to bottom; simp exits the script with status 1 on the
# first failure. The first argument is the input path, the second the expected
# simplified form (without the "simplified=" prefix).
simp './foo/bar/' 'foo/bar/'
simp './foo/bar' 'foo/bar'
simp './foo/./bar' 'foo/bar'
simp './foo/bar/.././tmp/foobar' 'foo/tmp/foobar'
simp './foo/bar/.././tmp/foobar/../foobaz' 'foo/tmp/foobaz'
# single '..' collapses one segment
simp './a/../b' 'b'
simp './a/b/../../c' 'c'
# repeated './' is squeezed
simp './a/./././b' 'a/b'
# leading '..' that would go above the root is discarded, per RFC 3986 §5.2.4
simp './a/../../b' 'b'
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
simp 'a//b' 'a//b'

View File

@@ -9,6 +9,6 @@ TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
TEST_EXTENSIONS = .test
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-filter.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-mime.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
CLEANFILES = check-network_sh.cache

View File

@@ -472,7 +472,7 @@ TESTS_ENVIRONMENT = PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH \
ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS) \
HTTPS_SUPPORT=$(HTTPS_SUPPORT) top_srcdir=$(top_srcdir)
TEST_EXTENSIONS = .test
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-filter.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-mime.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
CLEANFILES = check-network_sh.cache
all: all-am