mirror of
https://github.com/xroche/httrack.git
synced 2026-06-13 22:04:07 +03:00
Compare commits
1 Commits
fix/lockpa
...
test/expan
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
43f72afbad |
@@ -1,5 +1,36 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# minimalistic charset test
|
||||
test "$(httrack -O /dev/null -#3 "iso-8859-1" "café")" == "café" || exit 1
|
||||
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
||||
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||
# conv CHARSET INPUT EXPECTED
# Runs `httrack -#3 CHARSET INPUT` and requires its stdout to be exactly
# EXPECTED. On a mismatch the failing case is reported on stderr (shell-quoted,
# so stray bytes are visible) and the whole test script exits with status 1.
conv() {
  local actual
  actual=$(httrack -O /dev/null -#3 "$1" "$2")
  if [[ "$actual" != "$3" ]]; then
    printf 'FAIL: -#3 %q %q: expected %q, got %q\n' "$1" "$2" "$3" "$actual" >&2
    exit 1
  fi
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
# runs CHARSET INPUT
# Runs `httrack -#3 CHARSET INPUT` with all output discarded and requires a
# zero exit status. Any other status is reported on stderr and the whole test
# script exits with status 1.
runs() {
  local status=0
  httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || status=$?
  if (( status != 0 )); then
    printf 'FAIL: -#3 %q %q exited with status %d\n' "$1" "$2" "$status" >&2
    exit 1
  fi
}
|
||||
|
||||
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
||||
|
||||
# already UTF-8: identity
|
||||
conv 'utf-8' 'café' 'café'
|
||||
|
||||
# bytes reinterpreted as latin-1: each input byte becomes one codepoint
|
||||
conv 'iso-8859-1' 'café' 'café'
|
||||
|
||||
# windows-1252 is NOT latin-1: 0x80 is the euro sign, not U+0080. This is the
|
||||
# case that actually exercises the cp1252 table (the 0x80-0x9F range).
|
||||
conv 'windows-1252' $'\x80' '€'
|
||||
|
||||
# pure ASCII is charset-invariant
|
||||
conv 'us-ascii' 'hello' 'hello'
|
||||
|
||||
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
|
||||
# decoded and yields empty output (an error is printed to stderr).
|
||||
conv 'no-such-charset-xyz' 'abc' 'abc'
|
||||
test "$(httrack -O /dev/null -#3 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||
|
||||
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
|
||||
runs 'utf-8' $'\x80'
|
||||
runs 'utf-8' $'\xc3'
|
||||
|
||||
@@ -1,5 +1,49 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# basic entity handling (including bogus entities)
|
||||
test "$(httrack -O /dev/null -#6 "&foo; thé&café&#e9;もののけ姫")" == "&foo; thé&café&#e9;もののけ姫" || exit 1
|
||||
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
||||
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
||||
# ent INPUT EXPECTED
# Runs `httrack -#6 INPUT` and requires its stdout to be exactly EXPECTED.
# On a mismatch the failing case is reported on stderr (shell-quoted, so
# non-printable bytes are visible) and the test script exits with status 1.
ent() {
  local actual
  actual=$(httrack -O /dev/null -#6 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: -#6 %q: expected %q, got %q\n' "$1" "$2" "$actual" >&2
    exit 1
  fi
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
# runs INPUT
# Runs `httrack -#6 INPUT` with all output discarded and requires a zero exit
# status. Any other status is reported on stderr and the whole test script
# exits with status 1.
runs() {
  local status=0
  httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || status=$?
  if (( status != 0 )); then
    printf 'FAIL: -#6 %q exited with status %d\n' "$1" "$status" >&2
    exit 1
  fi
}
|
||||
|
||||
# NOTE(review): the entity literals below were reconstructed from the per-case
# comments, because the copy under review showed them already decoded (which
# made the cases vacuous). Confirm each literal against the repository file.

# named entities
ent '&amp;' '&'
ent '&lt;&gt;' '<>'
ent '&eacute;' 'é'

# numeric: decimal and hex
ent '&#65;&#66;' 'AB'
ent '&#x41;' 'A'
ent '&#xe9;' 'é'

# malformed numeric reference (decimal 'e9' has no digits) is left verbatim
ent '&#e9;' '&#e9;'

# U+0000 is not emitted; the reference is left verbatim
ent '&#0;' '&#0;'

# unknown entity is left verbatim
ent '&unknownentity;' '&unknownentity;'

# no entities: pass-through
ent 'plain text' 'plain text'

# decoding is a single pass: &amp;amp; -> &amp; (not &)
ent '&amp;amp;' '&amp;'

# KNOWN BUG: &nbsp; (U+00A0) decodes to a plain space (0x20), not C2 A0. The
# engine forces 160 -> 32 in htsencoding.c (FIXME hack). Locked here; if that
# hack is ever removed, update this to expect the C2 A0 byte.
ent '&nbsp;' ' '

# overflowing numeric reference must not crash (value far above U+10FFFF)
# NOTE(review): exact digits are a reconstruction; any value > 0x10FFFF serves.
runs '&#99999999999;'

# original compound case. NOTE: the space after '&foo;' is the known bug
# above (U+00A0 -> 0x20), not a real space in the source.
ent '&foo;&nbsp;th&eacute;&caf&eacute;&#e9;&#x3082;&#x306e;&#x306e;&#x3051;&#x59eb;' '&foo; thé&café&#e9;もののけ姫'
|
||||
|
||||
49
tests/01_engine-filter.test
Executable file
49
tests/01_engine-filter.test
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
|
||||
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||
|
||||
# match FILTER STRING
# Runs `httrack -#0 FILTER STRING` and requires its stdout to be exactly
# "STRING does match FILTER". Otherwise the case is reported on stderr and the
# whole test script exits with status 1.
match() {
  local expected="$2 does match $1"
  local actual
  actual=$(httrack -O /dev/null -#0 "$1" "$2")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: -#0 %q %q: expected "%s", got "%s"\n' \
      "$1" "$2" "$expected" "$actual" >&2
    exit 1
  fi
}
|
||||
# nomatch FILTER STRING
# Runs `httrack -#0 FILTER STRING` and requires its stdout to be exactly
# "STRING does NOT match FILTER". Otherwise the case is reported on stderr and
# the whole test script exits with status 1.
nomatch() {
  local expected="$2 does NOT match $1"
  local actual
  actual=$(httrack -O /dev/null -#0 "$1" "$2")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: -#0 %q %q: expected "%s", got "%s"\n' \
      "$1" "$2" "$expected" "$actual" >&2
    exit 1
  fi
}
|
||||
|
||||
# bare star matches everything
|
||||
match '*' 'anything/at/all'
|
||||
|
||||
# prefix / suffix
|
||||
match 'foo*' 'foobar'
|
||||
nomatch 'foo*' 'xfoobar'
|
||||
match '*.gif' 'a/b/c.gif'
|
||||
|
||||
# extension match is case-insensitive
|
||||
match '*.GIF' 'a.gif'
|
||||
|
||||
# character classes
|
||||
match '*[A-Z].txt' 'B.txt'
|
||||
nomatch '*[A-Z].txt' 'b.txt'
|
||||
match '*[0-9]' '5'
|
||||
nomatch '*[0-9]' 'x'
|
||||
|
||||
# comma-separated class: both ranges are active, the comma is not matched
|
||||
# literally and a char in neither range fails
|
||||
match '*[A-Z,0-9]' 'Q'
|
||||
match '*[A-Z,0-9]' '3'
|
||||
nomatch '*[A-Z,0-9]' 'a'
|
||||
|
||||
# named groups: [file] stops at '/', [path] spans it
|
||||
match '*[file].html' 'foo.html'
|
||||
nomatch '*[file].html' 'foo/bar.html'
|
||||
match '*[path]x' 'a/b/x'
|
||||
|
||||
# *[] means "nothing more after the star"
|
||||
nomatch '*[]' 'abc'
|
||||
|
||||
# multiple stars
|
||||
match '*foo*bar' 'foozbar'
|
||||
|
||||
# '?' is the query-string marker, not a single-char wildcard
|
||||
nomatch 'a?c' 'abc'
|
||||
@@ -1,10 +1,36 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# IDNA routine
|
||||
test "$(httrack -O /dev/null -#4 "www.café.com")" == "www.xn--caf-dma.com" || exit 1
|
||||
test "$(httrack -O /dev/null -#4 "www.もののけ姫-the-movie.com")" == "www.xn---the-movie-g63irla2z8297c.com" || exit 1
|
||||
# IDNA / punycode encode (-#4) and decode (-#5). This code has a CVE history,
|
||||
# so the edge cases below cover passthrough, round-trips, and malformed input.
|
||||
|
||||
# reverse IDNA
|
||||
test "$(httrack -O /dev/null -#5 "www.xn--caf-dma.com")" == "www.café.com" || exit 1
|
||||
test "$(httrack -O /dev/null -#5 "www.xn---the-movie-g63irla2z8297c.com")" == "www.もののけ姫-the-movie.com" || exit 1
|
||||
# enc HOSTNAME EXPECTED
# Runs `httrack -#4 HOSTNAME` and requires its stdout to be exactly EXPECTED.
# On a mismatch the case is reported on stderr and the script exits 1.
enc() {
  local actual
  actual=$(httrack -O /dev/null -#4 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: -#4 %q: expected %q, got %q\n' "$1" "$2" "$actual" >&2
    exit 1
  fi
}
|
||||
# dec HOSTNAME EXPECTED
# Runs `httrack -#5 HOSTNAME` and requires its stdout to be exactly EXPECTED.
# On a mismatch the case is reported on stderr and the script exits 1.
dec() {
  local actual
  actual=$(httrack -O /dev/null -#5 "$1")
  if [[ "$actual" != "$2" ]]; then
    printf 'FAIL: -#5 %q: expected %q, got %q\n' "$1" "$2" "$actual" >&2
    exit 1
  fi
}
|
||||
# crash probe: malformed ACE input must exit cleanly, not abort.
|
||||
# runs HOSTNAME
# Runs `httrack -#5 HOSTNAME` with all output discarded and requires a zero
# exit status. Any other status is reported on stderr and the script exits 1.
runs() {
  local status=0
  httrack -O /dev/null -#5 "$1" >/dev/null 2>&1 || status=$?
  if (( status != 0 )); then
    printf 'FAIL: -#5 %q exited with status %d\n' "$1" "$status" >&2
    exit 1
  fi
}
|
||||
|
||||
# encode
|
||||
enc 'www.café.com' 'www.xn--caf-dma.com'
|
||||
enc 'www.もののけ姫-the-movie.com' 'www.xn---the-movie-g63irla2z8297c.com'
|
||||
enc 'münchen.de' 'xn--mnchen-3ya.de'
|
||||
|
||||
# decode (reverse of the above)
|
||||
dec 'www.xn--caf-dma.com' 'www.café.com'
|
||||
dec 'www.xn---the-movie-g63irla2z8297c.com' 'www.もののけ姫-the-movie.com'
|
||||
dec 'xn--mnchen-3ya.de' 'münchen.de'
|
||||
|
||||
# pure-ASCII hostnames are unchanged either way
|
||||
enc 'plain.example.com' 'plain.example.com'
|
||||
dec 'plain.example.com' 'plain.example.com'
|
||||
enc 'a.b.c.example.org' 'a.b.c.example.org'
|
||||
|
||||
# an all-ASCII label (even one starting with the xn-- prefix) is passed through
|
||||
# by the encoder untouched, since there is nothing to encode
|
||||
enc 'xn--already-encoded.com' 'xn--already-encoded.com'
|
||||
|
||||
# an empty punycode payload decodes back to the bare xn-- label
|
||||
dec 'xn--' 'xn--'
|
||||
|
||||
# malformed ACE payloads (invalid base-36, garbage) must not crash
|
||||
runs 'xn--!!!'
|
||||
runs 'xn--already-encoded.com'
|
||||
|
||||
27
tests/01_engine-mime.test
Executable file
27
tests/01_engine-mime.test
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# MIME type guessing from extension (get_httptype / give_mimext).
|
||||
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||
|
||||
# mime PATH TYPE
# Runs `httrack -#2 PATH` and requires the first line of its stdout to be
# exactly "PATH is 'TYPE'". Otherwise the case is reported on stderr and the
# whole test script exits with status 1.
mime() {
  local expected="$1 is '$2'"
  local first_line
  # `head -n 1` is the POSIX spelling; the bare `-1` form is obsolescent.
  first_line=$(httrack -O /dev/null -#2 "$1" | head -n 1)
  if [[ "$first_line" != "$expected" ]]; then
    printf 'FAIL: -#2 %q: expected "%s", got "%s"\n' \
      "$1" "$expected" "$first_line" >&2
    exit 1
  fi
}
|
||||
# unknown PATH
# Runs `httrack -#2 PATH` and requires the first line of its stdout to be
# exactly "PATH is of an unknown MIME type". Otherwise the case is reported on
# stderr and the whole test script exits with status 1.
unknown() {
  local expected="$1 is of an unknown MIME type"
  local first_line
  # `head -n 1` is the POSIX spelling; the bare `-1` form is obsolescent.
  first_line=$(httrack -O /dev/null -#2 "$1" | head -n 1)
  if [[ "$first_line" != "$expected" ]]; then
    printf 'FAIL: -#2 %q: expected "%s", got "%s"\n' \
      "$1" "$expected" "$first_line" >&2
    exit 1
  fi
}
|
||||
|
||||
mime '/a/b.html' 'text/html'
|
||||
mime '/a/b.htm' 'text/html'
|
||||
mime '/x.css' 'text/css'
|
||||
mime '/x.js' 'application/x-javascript'
|
||||
mime '/x.png' 'image/png'
|
||||
mime '/x.jpg' 'image/jpeg'
|
||||
mime '/x.gif' 'image/gif'
|
||||
mime '/x.txt' 'text/plain'
|
||||
mime '/x.xml' 'application/xml'
|
||||
mime '/x.pdf' 'application/pdf'
|
||||
|
||||
# no extension, or one not in the table
|
||||
unknown '/noext'
|
||||
unknown '/x.unknownext'
|
||||
@@ -1,9 +1,26 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# simplify engine
|
||||
test "$(httrack -O /dev/null -#1 ./foo/bar/)" == "simplified=foo/bar/" || exit 1
|
||||
test "$(httrack -O /dev/null -#1 ./foo/bar)" == "simplified=foo/bar" || exit 1
|
||||
test "$(httrack -O /dev/null -#1 ./foo/./bar)" == "simplified=foo/bar" || exit 1
|
||||
test "$(httrack -O /dev/null -#1 ./foo/bar/.././tmp/foobar)" == "simplified=foo/tmp/foobar" || exit 1
|
||||
test "$(httrack -O /dev/null -#1 ./foo/bar/.././tmp/foobar/../foobaz)" == "simplified=foo/tmp/foobaz" || exit 1
|
||||
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
||||
# simp PATH EXPECTED
# Runs `httrack -#1 PATH` and requires its stdout to be exactly
# "simplified=EXPECTED". Otherwise the case is reported on stderr and the
# whole test script exits with status 1.
simp() {
  local expected="simplified=$2"
  local actual
  actual=$(httrack -O /dev/null -#1 "$1")
  if [[ "$actual" != "$expected" ]]; then
    printf 'FAIL: -#1 %q: expected "%s", got "%s"\n' \
      "$1" "$expected" "$actual" >&2
    exit 1
  fi
}
|
||||
|
||||
simp './foo/bar/' 'foo/bar/'
|
||||
simp './foo/bar' 'foo/bar'
|
||||
simp './foo/./bar' 'foo/bar'
|
||||
simp './foo/bar/.././tmp/foobar' 'foo/tmp/foobar'
|
||||
simp './foo/bar/.././tmp/foobar/../foobaz' 'foo/tmp/foobaz'
|
||||
|
||||
# single '..' collapses one segment
|
||||
simp './a/../b' 'b'
|
||||
simp './a/b/../../c' 'c'
|
||||
|
||||
# repeated './' is squeezed
|
||||
simp './a/./././b' 'a/b'
|
||||
|
||||
# leading '..' that would go above the root is discarded, per RFC 3986 §5.2.4
|
||||
simp './a/../../b' 'b'
|
||||
|
||||
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
|
||||
simp 'a//b' 'a//b'
|
||||
|
||||
@@ -9,6 +9,6 @@ TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
||||
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
||||
|
||||
TEST_EXTENSIONS = .test
|
||||
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
|
||||
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-filter.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-mime.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -472,7 +472,7 @@ TESTS_ENVIRONMENT = PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH \
|
||||
ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS) \
|
||||
HTTPS_SUPPORT=$(HTTPS_SUPPORT) top_srcdir=$(top_srcdir)
|
||||
TEST_EXTENSIONS = .test
|
||||
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
|
||||
TESTS = 00_runnable.test 01_engine-charset.test 01_engine-entities.test 01_engine-filter.test 01_engine-hashtable.test 01_engine-idna.test 01_engine-mime.test 01_engine-simplify.test 02_manpage-regen.test 10_crawl-simple.test 11_crawl-cookies.test 11_crawl-idna.test 11_crawl-international.test 11_crawl-longurl.test 11_crawl-parsing.test 12_crawl_https.test
|
||||
CLEANFILES = check-network_sh.cache
|
||||
all: all-am
|
||||
|
||||
|
||||
Reference in New Issue
Block a user