mirror of
https://github.com/xroche/httrack.git
synced 2026-06-28 21:17:57 +03:00
Compare commits
2 Commits
fix-pause-
...
fix/strjok
| Author | SHA1 | Date | |
|---|---|---|---|
| | c292454271 | | |
| | e0d74976e7 | | |
@@ -247,7 +247,7 @@ See also: The <a href="faq.html#VF1">FAQ</a><br>
|
||||
<td>the \ character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td nowrap><tt>*[\[\]]</tt></td>
|
||||
<td nowrap><tt>*[\[,\]]</tt></td>
|
||||
<td>the [ or ] character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
||||
@@ -193,7 +193,12 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
int len = (int) strlen(joker);
|
||||
|
||||
while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
|
||||
if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
// '\' escapes the next char as a literal member, e.g. *[\[\]]
|
||||
if (joker[i] == '\\' && joker[i + 1] != '\0') {
|
||||
i++;
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
} else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
int lsize = 0;
|
||||
int lverdict;
|
||||
|
||||
@@ -221,7 +226,9 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
while(isdigit((unsigned char) joker[i]))
|
||||
i++;
|
||||
}
|
||||
} else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
|
||||
} else if (joker[i + 1] == '-' && joker[i + 2] != '\0') {
|
||||
// range *[A-Z]; the '\0' guard rejects a truncated *[a- (else
|
||||
// i+=3 overshoots the NUL)
|
||||
if ((int) (unsigned char) joker[i + 2] >
|
||||
(int) (unsigned char) joker[i]) {
|
||||
int j;
|
||||
@@ -233,10 +240,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
}
|
||||
// else err=1;
|
||||
i += 3;
|
||||
} else { // 1 car, ex: *[ ]
|
||||
if (joker[i + 2] == '\\' && joker[i + 3] != 0) { // escaped char, such as *[\[] or *[\]]
|
||||
i++;
|
||||
}
|
||||
} else { // 1 car, ex: *[ ]
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
}
|
||||
|
||||
@@ -512,15 +512,21 @@ static int string_safety_selftests(void) {
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
static int st_filter(httrackp *opt, int argc, char **argv) {
|
||||
char *str, *pat;
|
||||
int matched;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "filter: needs a filter pattern and a string\n");
|
||||
return 1;
|
||||
}
|
||||
if (strjoker(argv[1], argv[0], NULL, NULL))
|
||||
printf("%s does match %s\n", argv[1], argv[0]);
|
||||
else
|
||||
printf("%s does NOT match %s\n", argv[1], argv[0]);
|
||||
/* exact-size heap copies so a sanitizer traps any over-read of the pattern */
|
||||
str = strdupt(argv[1]);
|
||||
pat = strdupt(argv[0]);
|
||||
matched = strjoker(str, pat, NULL, NULL) != NULL;
|
||||
printf("%s does %s %s\n", argv[1], matched ? "match" : "NOT match", argv[0]);
|
||||
freet(str);
|
||||
freet(pat);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -50,27 +50,54 @@ match '*foo*bar' 'foozbar'
|
||||
# '?' is the query-string marker, not a single-char wildcard
|
||||
nomatch 'a?c' 'abc'
|
||||
|
||||
# backslash escapes a metacharacter inside a class so it is matched literally.
|
||||
# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
|
||||
# both X and '\'. These assertions pin that behavior.
|
||||
# Inside a class, backslash escapes the next char as a literal member (#148):
|
||||
# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator.
|
||||
match '*[\*]' '*'
|
||||
match '*[\*]' "\\"
|
||||
nomatch '*[\*]' 'a'
|
||||
nomatch '*[\*]' "\\"
|
||||
match '*[\\]' "\\"
|
||||
nomatch '*[\\]' 'a'
|
||||
nomatch '*[\\]' '*'
|
||||
match '*[\[]' '['
|
||||
match '*[\[]' "\\"
|
||||
nomatch '*[\[]' 'a'
|
||||
nomatch '*[\[]' "\\"
|
||||
match '*[\]]' ']'
|
||||
nomatch '*[\]]' "\\"
|
||||
|
||||
# A literal ']' cannot be a class member: the class parser stops at the first
|
||||
# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
|
||||
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
|
||||
# by a trailing literal ']'. These assertions document the current (buggy)
|
||||
# behavior so any future matcher fix is a deliberate, visible change.
|
||||
nomatch '*[\[\]]' '[' # not matched, despite the docs
|
||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||
nomatch '*[\[\]]' '[]x'
|
||||
# '*[\[\]]' is "the [ or ] character", as the filter guide documents.
|
||||
match '*[\[\]]' '['
|
||||
match '*[\[\]]' ']'
|
||||
nomatch '*[\[\]]' 'a'
|
||||
match '*[\[,\]]' '[' # comma between members is optional
|
||||
match '*[\[,\]]' ']'
|
||||
match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one
|
||||
match '*[a,\[]' '['
|
||||
|
||||
# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
|
||||
# are literal members, not operators.
|
||||
match '*[a\-z]' 'a'
|
||||
match '*[a\-z]' 'z'
|
||||
nomatch '*[a\-z]' 'b' # not the a..z range
|
||||
match '*[\,]' ','
|
||||
nomatch '*[\,]' "\\" # the escape must not leak '\' into the class
|
||||
match '*[\<]' '<'
|
||||
nomatch '*[\<]' "\\"
|
||||
match '*[\[,\],a]' '['
|
||||
match '*[\[,\],a]' ']'
|
||||
match '*[\[,\],a]' 'a'
|
||||
|
||||
# A truncated range '*[a-' is the literal members {a,-}; the parser must not
|
||||
# read past the end decoding it (was a 1-byte heap over-read in the range arm).
|
||||
match '*[a-' 'a'
|
||||
nomatch '*[a-' 'b'
|
||||
|
||||
# *(...) matches exactly one char from the class; *[...] matches a run.
|
||||
match '*(a,b)' 'a'
|
||||
nomatch '*(a,b)' 'aa'
|
||||
nomatch '*(a,b)' 'c'
|
||||
|
||||
# documented composite filters (filters.html)
|
||||
match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
|
||||
nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
|
||||
match '*.html*[]' 'page.html'
|
||||
nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query
|
||||
|
||||
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
|
||||
# means the size is still unknown (scan time). A size exclusion must stay neutral
|
||||
|
||||
Reference in New Issue
Block a user