htsparse: don't read *(html-1) before the parse buffer (#396)

The link detector's word-boundary guards dereference *(html-1) to check the byte preceding a matched token. When the token sits at the very start of the parse buffer (html == r->adr), that reads one byte before the allocation: a heap-buffer-overflow under ASan, silent on a normal build. A stylesheet beginning with a url() token is enough to hit it. Route the three reachable guards (url(), location=, the makeindex /title check) through html_prevc(), which returns a space sentinel at the buffer start. Space is the right value for these tests: a token at offset 0 is at a word boundary, so it stays a valid match. The other *(html-1) sites only run after html has advanced past an opening tag or quote. The fix is covered by an offset-0 url() fixture in 01_engine-parse.test; without the fix, that test aborts at htsparse.c:1386 under the CI sanitizer job. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Xavier Roche <roche@httrack.com>
Merge pull request #397 from xroche/fix/css-import-94
2026-06-20 09:09:02 +03:00 · 2026-06-19 19:44:25 +02:00 · 2026-06-19 19:30:21 +02:00
2 changed files with 23 additions and 5 deletions
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -296,6 +296,12 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
  return dst;
 }

+/* Return the byte immediately before html as an unsigned char value, so the
+   result can be passed straight to <ctype.h> classifiers (a negative char
+   there is undefined behavior). When html is at or before start (the first
+   byte of the parse buffer), html[-1] would underflow the buffer, so a space
+   sentinel is returned instead; space reads as the word boundary the guards
+   want there. */
+static HTS_INLINE int html_prevc(const char *html, const char *start) {
+  return html > start ? (unsigned char) html[-1] : ' ';
+}
+
 /* Main parser */
 int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
  char catbuff[CATBUFF_SIZE];
@@ -556,7 +562,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                  if (opt->getmode & HTS_GETMODE_HTML) {
                    p = strfield(html, "title");
                    if (p) {
-                      if (*(html - 1) == '/')
+                      if (html_prevc(html, r->adr) == '/')
                        p = 0;  // /title
                    } else {
                      if (strfield(html, "/html"))
@@ -1360,9 +1366,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                      if (!nc)
                        nc = strfield(html, ":location");        // javascript:location="doc"
                      if (!nc) {        // location="doc"
-                        if ((nc = strfield(html, "location"))
-                            && !isspace(*(html - 1))
-                          )
+                        if ((nc = strfield(html, "location")) &&
+                            !isspace(html_prevc(html, r->adr)))
                          nc = 0;
                      }
                      if (!nc)
@@ -1383,7 +1388,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                          expected = '(';       // parenthèse
                          expected_end = ")";   // fin: parenthèse
                        }
-                      if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') {  // url(url)
+                      if (!nc && (nc = strfield(html, "url")) &&
+                          (!isalnum(html_prevc(html, r->adr))) &&
+                          html_prevc(html, r->adr) != '_') { // url(url)
                        expected = '('; // parenthèse
                        expected_end = ")";     // fin: parenthèse
                        can_avoid_quotes = 1;
--- a/tests/01_engine-parse.test
+++ b/tests/01_engine-parse.test
@@ -220,4 +220,15 @@ crawl "$site5/main.css" "$out5"
 found "good.css" "$out5"
 notfound "trunc" "$out5"

+# Offset-0 underflow (#396): a token at the very start of the buffer made the
+# detector's word-boundary guard read *(html-1), one byte before the buffer
+# (ASan abort). The url() target is still captured; it just must not underflow.
+site6="$tmp/parse-off0"
+mkdir -p "$site6"
+printf 'body{}\n' >"$site6/off0.css"
+printf 'url(off0.css)\n' >"$site6/main.css"
+out6="$tmp/parse-off0-out"
+crawl "$site6/main.css" "$out6"
+found "off0.css" "$out6"
+
 exit 0