httrack/tests/01_engine-filter.test

#!/bin/bash
#

# Strict mode: stop on the first failing command (errexit), treat expansion of
# an unset variable as an error (nounset), and let a pipeline fail when any of
# its stages fails (pipefail).
set -o errexit -o nounset -o pipefail

# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".

# Assert that filter $1 accepts string $2.
#
# Runs "httrack -O /dev/null -#0 <filter> <string>" and compares its stdout
# with the exact line "<string> does match <filter>".
#
# Arguments: $1 - filter pattern
#            $2 - candidate string
# Outputs:   nothing on success; on mismatch a diagnostic on stderr showing
#            the expected line, the actual output and httrack's exit status
# Returns:   0 on success; terminates the script with status 1 on mismatch
match() {
    local filter=$1 candidate=$2
    local expected="$candidate does match $filter"
    local actual status=0

    # Assigned separately from 'local' so httrack's exit status is observable.
    # The status is only reported: the verdict depends on stdout alone, and
    # '|| status=$?' keeps errexit from aborting before we can explain why.
    actual=$(httrack -O /dev/null -#0 "$filter" "$candidate") || status=$?

    # The right-hand side is quoted so filters such as '*' or '*[A-Z]' are
    # compared as literal text, never as a glob pattern.
    if [[ "$actual" != "$expected" ]]; then
        printf 'FAIL: match %q %q\n' "$filter" "$candidate" >&2
        printf '  expected: %s\n' "$expected" >&2
        printf '  actual:   %s\n' "$actual" >&2
        printf '  httrack exit status: %d\n' "$status" >&2
        exit 1
    fi
}
# Assert that filter $1 rejects string $2.
#
# Runs "httrack -O /dev/null -#0 <filter> <string>" and compares its stdout
# with the exact line "<string> does NOT match <filter>".
#
# Arguments: $1 - filter pattern
#            $2 - candidate string
# Outputs:   nothing on success; on mismatch a diagnostic on stderr showing
#            the expected line, the actual output and httrack's exit status
# Returns:   0 on success; terminates the script with status 1 on mismatch
nomatch() {
    local filter=$1 candidate=$2
    local expected="$candidate does NOT match $filter"
    local actual status=0

    # Assigned separately from 'local' so httrack's exit status is observable.
    # The status is only reported: the verdict depends on stdout alone, and
    # '|| status=$?' keeps errexit from aborting before we can explain why.
    actual=$(httrack -O /dev/null -#0 "$filter" "$candidate") || status=$?

    # The right-hand side is quoted so filters such as '*' or '*[A-Z]' are
    # compared as literal text, never as a glob pattern.
    if [[ "$actual" != "$expected" ]]; then
        printf 'FAIL: nomatch %q %q\n' "$filter" "$candidate" >&2
        printf '  expected: %s\n' "$expected" >&2
        printf '  actual:   %s\n' "$actual" >&2
        printf '  httrack exit status: %d\n' "$status" >&2
        exit 1
    fi
}

# Expectation table, read three words at a time:
#     <assertion> <filter> <string>
# where <assertion> names the helper to invoke (match or nomatch).
filter_cases=(
    # A lone star accepts any string.
    match '*' 'anything/at/all'

    # Star used as a trailing or leading wildcard.
    match 'foo*' 'foobar'
    nomatch 'foo*' 'xfoobar'
    match '*.gif' 'a/b/c.gif'

    # The letter case of the extension is ignored.
    match '*.GIF' 'a.gif'

    # Bracketed ranges accept one character from the range and nothing else.
    match '*[A-Z].txt' 'B.txt'
    nomatch '*[A-Z].txt' 'b.txt'
    match '*[0-9]' '5'
    nomatch '*[0-9]' 'x'

    # Ranges joined by a comma are all in effect; the comma itself is only a
    # separator, and a character outside every range is rejected.
    match '*[A-Z,0-9]' 'Q'
    match '*[A-Z,0-9]' '3'
    nomatch '*[A-Z,0-9]' 'a'

    # Named classes: [file] does not cross a '/', [path] does.
    match '*[file].html' 'foo.html'
    nomatch '*[file].html' 'foo/bar.html'
    match '*[path]x' 'a/b/x'

    # An empty class after the star means the string must end there.
    nomatch '*[]' 'abc'

    # More than one star in a single filter.
    match '*foo*bar' 'foozbar'

    # '?' marks the query string; it does not stand for "any one character".
    nomatch 'a?c' 'abc'

    # Inside a class, a backslash makes the following metacharacter literal.
    # Quirk pinned here: the decoder puts the backslash itself into the set as
    # well, so '\X' accepts both X and '\'.
    match '*[\*]' '*'
    match '*[\*]' "\\"
    nomatch '*[\*]' 'a'
    match '*[\\]' "\\"
    nomatch '*[\\]' 'a'
    match '*[\[]' '['
    match '*[\[]' "\\"
    nomatch '*[\[]' 'a'

    # ']' can never be a member of a class: parsing of the class ends at the
    # first ']', whether or not it is escaped. Hence '*[\[\]]' is NOT "either
    # '[' or ']'" as the filter guide states (GitHub #148); it is read as the
    # class {'[','\'} followed by a literal ']'. The rows below record today's
    # (buggy) results so that a matcher fix shows up as an intentional change.
    nomatch '*[\[\]]' '[' # not matched, despite the docs
    match '*[\[\]]' ']'   # only via the empty class-match + trailing ']'
    match '*[\[\]]' '[]'  # one of {'[','\'} then the trailing ']'
    nomatch '*[\[\]]' '[]x'
)

# Run the rows in table order; each helper ends the script on a failed
# expectation, so the first failure stops the run.
for ((row = 0; row < ${#filter_cases[@]}; row += 3)); do
    "${filter_cases[row]}" "${filter_cases[row + 1]}" "${filter_cases[row + 2]}"
done