test: characterize wildcard class escape behavior

Add -#0 self-test cases for backslash escapes inside a '*[...]' class. They pin two quirks of the current decoder: '\X' matches both X and the backslash itself, and a literal ']' cannot be a class member because the parser stops at the first ']' (escaped or not). The latter is why the filter guide's '*[\[\]]' = "the [ or ] character" claim is wrong (#148): it parses as the class {[,\} plus a trailing literal ']'. These tests lock the behavior down so a later matcher fix is a deliberate change. refs #148
Merge pull request #322 from xroche/test/expand-engine-coverage
2026-06-14 06:14:23 +03:00 · 2026-06-13 10:15:45 +02:00 · 2026-06-13 09:58:03 +02:00 · 2026-06-13 09:18:39 +02:00 · 2026-06-13 09:18:10 +02:00 · 2026-06-13 09:18:02 +02:00
4 changed files with 68 additions and 3 deletions
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2899,7 +2899,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
  }

  {
-    char n_lock[256];
+    /* Sized to the concat-buffer capacity so it can always hold the lock-file
+       path produced by fconcat(), even with a long log path (issue #183). */
+    char n_lock[OPT_GET_BUFF_SIZE(opt)];

    // on peut pas avoir un affichage ET un fichier log
    // ca sera pour la version 2
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -274,6 +274,28 @@ Please visit our Website: http://www.httrack.com
  } \
 } while(0)

+/* Percent-encode the angle brackets of a string so it is safe to embed inside
+   an HTML comment (the default footer) or any other HTML context. A URL holding
+   "-->" would otherwise close the footer comment and inject markup (issue #165).
+   Raw '<' and '>' are not valid URL characters, so encoding them is harmless. */
+static const char *html_inline_safe(const char *src, char *dst, size_t size) {
+  size_t i, j; /* i: read index into src, j: write index into dst */
+
+  for(i = 0, j = 0; src[i] != '\0' && j + 4 < size; i++) { /* stop early (silent truncation) once dst can no longer hold a 3-byte escape plus the NUL */
+    const char c = src[i];
+
+    if (c == '<' || c == '>') { /* only these two bytes are rewritten */
+      dst[j++] = '%';
+      dst[j++] = '3';
+      dst[j++] = (c == '<') ? 'C' : 'E'; /* '<' becomes "%3C", '>' becomes "%3E" */
+    } else {
+      dst[j++] = c; /* every other byte is copied unchanged, '&' and quotes included */
+    }
+  }
+  dst[j] = '\0'; /* NOTE(review): dst[0] is written even when size is 0; the visible callers pass sizeof() of large stack buffers */
+  return dst; /* same pointer as the dst argument, so the call can be used inline as an argument */
+}
+
 /* Main parser */
 int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
  char catbuff[CATBUFF_SIZE];
@@ -719,13 +741,16 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                if (StringNotEmpty(opt->footer)) {
                  char BIGSTK tempo[1024 + HTS_URLMAXSIZE * 2];
                  char gmttime[256];
+                  char BIGSTK safe_adr[HTS_URLMAXSIZE * 3 + 4];
+                  char BIGSTK safe_fil[HTS_URLMAXSIZE * 3 + 4];

                  tempo[0] = '\0';
                  time_gmt_rfc822(gmttime);
                  strcatbuff(tempo, eol);
                  hts_template_format_str(tempo + strlen(tempo), sizeof(tempo) - strlen(tempo),
                          StringBuff(opt->footer),
-                          jump_identification_const(urladr()), urlfil(), gmttime,
+                          html_inline_safe(jump_identification_const(urladr()), safe_adr, sizeof(safe_adr)),
+                          html_inline_safe(urlfil(), safe_fil, sizeof(safe_fil)), gmttime,
                          HTTRACK_VERSIONID, /* EOF */ NULL);
                  strcatbuff(tempo, eol);
                  //fwrite(tempo,1,strlen(tempo),fp);
--- a/src/htsthread.c
+++ b/src/htsthread.c
@@ -193,7 +193,23 @@ HTSEXT_API void hts_mutexfree(htsmutex * mutex) {
 HTSEXT_API void hts_mutexlock(htsmutex * mutex) {
  assertf(mutex != NULL);
  if (*mutex == HTSMUTEX_INIT) {        /* must be initialized */
-    hts_mutexinit(mutex);
+    /* Initialize exactly once, even when several threads race to lock the same
+       mutex for the first time. Build our own object, then publish it with a
+       single atomic compare-and-swap; the threads that lose the race free the
+       object they built (issue #297). No static guard is needed, which keeps
+       this safe on Windows 2000 (no statically-initializable lock there). */
+    htsmutex created = HTSMUTEX_INIT;
+
+    hts_mutexinit(&created);
+#ifdef _WIN32
+    if (InterlockedCompareExchangePointer((PVOID volatile *) mutex, created,
+                                          HTSMUTEX_INIT) != HTSMUTEX_INIT)
+#else
+    if (!__sync_bool_compare_and_swap(mutex, HTSMUTEX_INIT, created))
+#endif
+    {
+      hts_mutexfree(&created);
+    }
  }
  assertf(*mutex != NULL);
 #ifdef _WIN32
--- a/tests/01_engine-filter.test
+++ b/tests/01_engine-filter.test
@@ -47,3 +47,25 @@ match '*foo*bar' 'foozbar'

 # '?' is the query-string marker, not a single-char wildcard
 nomatch 'a?c' 'abc'
+
+# backslash escapes a metacharacter inside a class so it is matched literally.
+# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
+# both X and '\'. These assertions pin that behavior.
+match '*[\*]' '*'
+match '*[\*]' "\\"
+nomatch '*[\*]' 'a'
+match '*[\\]' "\\"
+nomatch '*[\\]' 'a'
+match '*[\[]' '['
+match '*[\[]' "\\"
+nomatch '*[\[]' 'a'
+
+# A literal ']' cannot be a class member: the class parser stops at the first
+# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
+# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
+# by a trailing literal ']'. These assertions document the current (buggy)
+# behavior so any future matcher fix is a deliberate, visible change.
+nomatch '*[\[\]]' '['   # not matched, despite the docs
+match '*[\[\]]' ']'     # only via the empty class-match + trailing ']'
+match '*[\[\]]' '[]'    # one of {'[','\'} then the trailing ']'
+nomatch '*[\[\]]' '[]x'
Author	SHA1	Message	Date
Xavier Roche	794404bba2	test: characterize wildcard class escape behavior Add -#0 self-test cases for backslash escapes inside a '*[...]' class. They pin two quirks of the current decoder: '\X' matches both X and the backslash itself, and a literal ']' cannot be a class member because the parser stops at the first ']' (escaped or not). The latter is why the filter guide's '*[\[\]]' = "the [ or ] character" claim is wrong (#148): it parses as the class {[,\} plus a trailing literal ']'. These tests lock the behavior down so a later matcher fix is a deliberate change. refs #148	2026-06-13 10:15:45 +02:00
Xavier Roche	89b25e418b	Merge pull request #322 from xroche/test/expand-engine-coverage test: expand offline engine self-test coverage	2026-06-13 09:58:03 +02:00
Xavier Roche	017c634c53	Merge pull request #321 from xroche/fix/mutex-init-race-297 Fix race in lazy mutex initialization	2026-06-13 09:18:39 +02:00
Xavier Roche	f2b36c4b29	Merge pull request #320 from xroche/fix/lockpath-overflow-183 Fix abort on long log path (lock-file buffer too small)	2026-06-13 09:18:10 +02:00
Xavier Roche	19947efd74	Merge pull request #319 from xroche/fix/footer-xss-165 Fix XSS via unescaped URL in the page footer comment	2026-06-13 09:18:02 +02:00
Xavier Roche	de26ad881a	fix: synchronize lazy mutex initialization (closes #297) Two threads locking the same mutex for the first time could both run the unsynchronized lazy init, corrupting the underlying pthread mutex and aborting or deadlocking. Build the object and publish it with a single atomic compare-and-swap; threads that lose the race free the object they built. This needs no statically-initializable guard, so it stays valid on Windows 2000.	2026-06-13 09:15:31 +02:00
Xavier Roche	106d34d82c	fix: size the lock-file path buffer to the concat buffer (closes #183) A long log path made the lock-file path overflow the fixed 256-byte n_lock buffer, tripping the guarded copy and aborting with signal 6. Size n_lock to the concat-buffer capacity so it holds any path fconcat can produce. (cherry picked from commit 15144ffd24667712cca2ac0fee96bd355239eff6)	2026-06-12 23:24:20 +02:00
Xavier Roche	61e0b3250b	fix: escape angle brackets in the page footer URL (closes #165) The default footer embeds the page URL inside an HTML comment. A URL containing "-->" closed the comment and let an attacker inject script into the mirrored page. Percent-encode < and > before the URL reaches the footer. (cherry picked from commit 606883229244dc233d16915678e63cfa62000ff0)	2026-06-12 23:24:20 +02:00