Files
httrack/tests/01_engine-filter.test
Xavier Roche f9f4700ee1 Reformat every shell script with shfmt -i 4
Mechanical pass: run shfmt -i 4 over the whole tracked shell tree (the
test harness .test files, the regen generators, webhttrack, the CGI
search helper, and the build/dist scripts) so they share one style.
shfmt also normalised backticks to $(...) and $[..] to $((..)).

No behaviour change: arithmetic is preserved exactly, non-ASCII bytes
are untouched, and the full make check suite still passes. The
tab-indented .test files become 4-space indented, hence the wide diff.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 11:24:01 +02:00

74 lines
2.2 KiB
Bash
Executable File

#!/bin/bash
#
set -euo pipefail
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
# Assert that a string matches a wildcard filter.
# Runs "httrack -O /dev/null -#0 <filter> <string>" and requires its stdout to
# be exactly "<string> does match <filter>".
# Arguments: $1 - filter pattern
#            $2 - string tested against the filter
# Outputs:   nothing on success; expected/actual lines on stderr on failure
# Exits:     terminates the script with status 1 when the assertion fails
match() {
    local expected="$2 does match $1"
    local actual
    # Only the printed verdict is judged: a non-zero httrack status must not
    # abort under "set -e" before the diagnostic below can be written.
    actual=$(httrack -O /dev/null -#0 "$1" "$2") || true
    if [[ "$actual" != "$expected" ]]; then
        printf 'match assertion failed: filter=%s string=%s\n  expected: %s\n  actual:   %s\n' \
            "$1" "$2" "$expected" "$actual" >&2
        exit 1
    fi
}
# Assert that a string does NOT match a wildcard filter.
# Runs "httrack -O /dev/null -#0 <filter> <string>" and requires its stdout to
# be exactly "<string> does NOT match <filter>".
# Arguments: $1 - filter pattern
#            $2 - string tested against the filter
# Outputs:   nothing on success; expected/actual lines on stderr on failure
# Exits:     terminates the script with status 1 when the assertion fails
nomatch() {
    local expected="$2 does NOT match $1"
    local actual
    # Only the printed verdict is judged: a non-zero httrack status must not
    # abort under "set -e" before the diagnostic below can be written.
    actual=$(httrack -O /dev/null -#0 "$1" "$2") || true
    if [[ "$actual" != "$expected" ]]; then
        printf 'nomatch assertion failed: filter=%s string=%s\n  expected: %s\n  actual:   %s\n' \
            "$1" "$2" "$expected" "$actual" >&2
        exit 1
    fi
}
# Assertion table, read three words at a time: helper, filter, subject.
# Rows are executed top to bottom by the loop that follows the table.
filter_cases=(
    # A lone star accepts any string.
    match '*' 'anything/at/all'

    # Star used as a suffix wildcard, then as a prefix wildcard.
    match 'foo*' 'foobar'
    nomatch 'foo*' 'xfoobar'
    match '*.gif' 'a/b/c.gif'

    # Letter case is ignored when matching an extension.
    match '*.GIF' 'a.gif'

    # Bracketed character classes.
    match '*[A-Z].txt' 'B.txt'
    nomatch '*[A-Z].txt' 'b.txt'
    match '*[0-9]' '5'
    nomatch '*[0-9]' 'x'

    # Class with a comma: each range applies, the comma itself is not a
    # literal member, and a character outside both ranges is rejected.
    match '*[A-Z,0-9]' 'Q'
    match '*[A-Z,0-9]' '3'
    nomatch '*[A-Z,0-9]' 'a'

    # Named groups: [file] does not cross a '/', [path] does.
    match '*[file].html' 'foo.html'
    nomatch '*[file].html' 'foo/bar.html'
    match '*[path]x' 'a/b/x'

    # An empty class after the star means "nothing more after the star".
    nomatch '*[]' 'abc'

    # More than one star in a filter.
    match '*foo*bar' 'foozbar'

    # '?' marks the query string; it is not a one-character wildcard.
    nomatch 'a?c' 'abc'

    # Inside a class, a backslash escapes a metacharacter so that it is taken
    # literally. Quirk: the decoder puts the backslash itself into the set as
    # well, so '\X' matches X and also '\'. The rows below pin that behavior.
    match '*[\*]' '*'
    match '*[\*]' "\\"
    nomatch '*[\*]' 'a'
    match '*[\\]' "\\"
    nomatch '*[\\]' 'a'
    match '*[\[]' '['
    match '*[\[]' "\\"
    nomatch '*[\[]' 'a'

    # ']' can never be a literal class member: the class parser ends the class
    # at the first ']' whether or not it is escaped. Hence '*[\[\]]' is NOT
    # "the [ or ] character" as the filter guide states (GitHub #148); it is
    # read as the class {'[','\'} followed by a trailing literal ']'. The rows
    # below record the current (buggy) behavior so that a future matcher fix
    # shows up as a deliberate, visible change.
    nomatch '*[\[\]]' '[' # rejected, contrary to the docs
    match '*[\[\]]' ']'   # only through an empty class-match plus the trailing ']'
    match '*[\[\]]' '[]'  # one member of {'[','\'} and then the trailing ']'
    nomatch '*[\[\]]' '[]x'
)
for ((row = 0; row < ${#filter_cases[@]}; row += 3)); do
    "${filter_cases[row]}" "${filter_cases[row + 1]}" "${filter_cases[row + 2]}"
done