Compare commits

..

6 Commits

Author SHA1 Message Date
Xavier Roche
a52a2b146c Add an offline update/cache regression test
Every crawl test runs httrack exactly once (crawl-test.sh), so the cache read /
update path (cache_readex) -- recently touched by the buffer-bounding work -- had
zero regression coverage: the cache was written but never read back.

Add tests/02_update-cache.test, a self-contained file:// two-pass test (no
network, always runs): mirror a local site, re-mirror it unchanged (the cache-
read pass must complete with no errors -- guards a crash/abort in cache_readex),
then change a source file and re-mirror (the update must pick up the new content
-- guards the update decision that reads the cached metadata).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-14 16:29:45 +02:00
Xavier Roche
226a38d3d0 Merge pull request #340 from xroche/cleanup/htscache-bounds
Bound htscache.c cache-field and save-name copies
2026-06-14 15:58:04 +02:00
Xavier Roche
1e463f65a5 Bound htscache.c cache-field and save-name copies
ZIP_READFIELD_STRING (the cached ZIP-header field reader) copied
attacker-influenced cache-file values into fixed htsblk fields with an unchecked
strcpybuff -- benign for the char[] fields, but r.location is a char* (degrades
to raw strcpy). Thread the destination size into the macro: sizeof(field) for
the array fields, HTS_URLMAXSIZE*2 for r.location (it points into a buffer of
that size, in both the caller-supplied and the location_default case).

Also bound cache_readex's return_save copy (its one non-NULL caller passes a
HTS_URLMAXSIZE*2 buffer), the exact-sized malloc copy in cache_rstr's default
path (strlen(defaultdata)+1), and replace the two strcpybuff(r.location, "")
clears with a direct r.location[0] = '\0'.

htscache.c pointer-destination warnings 6 -> 0.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-14 15:43:04 +02:00
Xavier Roche
09ed9968cd Merge pull request #339 from xroche/cleanup/htsbauth-bounds
Bound htsbauth cookie/auth buffer writes
2026-06-14 15:32:37 +02:00
Xavier Roche
ad6915e3cc Bound htsbauth cookie/auth buffer writes
cookie_get(), bauth_prefix(), cookie_insert() and cookie_delete() all wrote into
caller-provided char* buffers via unchecked strcpybuff/strcatbuff/strncatbuff
(the pointer-destination case). Bound them:

- cookie_get: write the extracted field with htsbuff over the buffer's 8192-byte
  contract (all callers use char[8192]).
- bauth_prefix: copy host+path with strlcpybuff/strlcatbuff bounded to the
  caller's HTS_URLMAXSIZE*2 buffer.
- cookie_insert/cookie_delete: thread the destination capacity (the cookie
  store's max_len minus the cursor offset) and use strlcpybuff/strlcatbuff;
  update cookie_add/cookie_del to pass it.

Add cookie_get field-extraction asserts to basic_selftests (run via -#7) rather
than a new -# digit. Translated the touched French comments.

htsbauth.c pointer-destination warnings 9 -> 0.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-14 15:29:33 +02:00
Xavier Roche
4a5580dec0 Merge pull request #338 from xroche/cleanup/htswizard-bounds
Build wizard auto-filter rules with htsbuff (bounded)
2026-06-14 14:37:56 +02:00
6 changed files with 133 additions and 41 deletions

View File

@@ -102,7 +102,8 @@ int cookie_add(t_cookie * cookie, const char *cook_name, const char *cook_value,
strcatbuff(cook, "\n");
if (!((strlen(cookie->data) + strlen(cook)) < cookie->max_len))
return -1; // impossible d'ajouter
cookie_insert(insert, cook);
cookie_insert(insert, cookie->max_len - (size_t) (insert - cookie->data),
cook);
#if DEBUG_COOK
printf("add_new cookie: name=\"%s\" value=\"%s\" domain=\"%s\" path=\"%s\"\n",
cook_name, cook_value, domain, path);
@@ -118,7 +119,7 @@ int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, con
b = cookie_find(cookie->data, cook_name, domain, path);
if (b) {
a = cookie_nextfield(b);
cookie_delete(b, a - b);
cookie_delete(b, cookie->max_len - (size_t) (b - cookie->data), a - b);
#if DEBUG_COOK
printf("deleted old cookie: %s %s %s\n", cook_name, domain, path);
#endif
@@ -336,41 +337,44 @@ int cookie_save(t_cookie * cookie, const char *name) {
return -1;
}
// insertion chaine ins avant s
void cookie_insert(char *s, const char *ins) {
// Insert string ins before s. s_size is the capacity of the buffer at s.
void cookie_insert(char *s, size_t s_size, const char *ins) {
char *buff;
if (strnotempty(s) == 0) { // rien à faire, juste concat
strcatbuff(s, ins);
if (strnotempty(s) == 0) { // nothing there yet: just concatenate
strlcatbuff(s, ins, s_size);
} else {
buff = (char *) malloct(strlen(s) + 1);
if (buff) {
strcpybuff(buff, s); // copie temporaire
strcpybuff(s, ins); // insérer
strcatbuff(s, buff); // copier
strlcpybuff(buff, s, strlen(s) + 1); // temporary copy of s
strlcpybuff(s, ins, s_size); // write ins
strlcatbuff(s, buff, s_size); // then the saved content
freet(buff);
}
}
}
// destruction chaine dans s position pos
void cookie_delete(char *s, size_t pos) {
// Delete the substring of s at position pos. s_size is the capacity at s.
void cookie_delete(char *s, size_t s_size, size_t pos) {
char *buff;
if (strnotempty(s + pos) == 0) { // rien à faire, effacer
if (strnotempty(s + pos) == 0) { // nothing after pos: truncate
s[0] = '\0';
} else {
buff = (char *) malloct(strlen(s + pos) + 1);
if (buff) {
strcpybuff(buff, s + pos); // copie temporaire
strcpybuff(s, buff); // copier
strlcpybuff(buff, s + pos, strlen(s + pos) + 1); // temporary copy
strlcpybuff(s, buff, s_size); // overwrite from start
freet(buff);
}
}
}
// renvoie champ param de la chaine cookie_base
// ex: cookie_get("ceci est<tab>un<tab>exemple",1) renvoi "un"
// Return field <param> (0-based, tab-separated) of the cookie line cookie_base,
// into buffer. ex: cookie_get("ceci est<tab>un<tab>exemple", 1) returns "un".
// buffer must hold at least COOKIE_FIELD_BUFFER_SIZE bytes (all callers use
// char[8192]).
#define COOKIE_FIELD_BUFFER_SIZE 8192
const char *cookie_get(char *buffer, const char *cookie_base, int param) {
const char *limit;
@@ -394,11 +398,11 @@ const char *cookie_get(char *buffer, const char *cookie_base, int param) {
if (cookie_base) {
if (cookie_base < limit) {
const char *a = cookie_base;
htsbuff b = htsbuff_ptr(buffer, COOKIE_FIELD_BUFFER_SIZE);
while((*a) && (*a != '\t') && (*a != '\n'))
a++;
buffer[0] = '\0';
strncatbuff(buffer, cookie_base, (int) (a - cookie_base));
htsbuff_catn(&b, cookie_base, (size_t) (a - cookie_base));
return buffer;
} else
return "";
@@ -458,11 +462,13 @@ char *bauth_check(t_cookie * cookie, const char *adr, const char *fil) {
return NULL;
}
/* Build the auth prefix (host + path, query stripped) into prefix.
Callers pass a buffer of HTS_URLMAXSIZE * 2 bytes. */
char *bauth_prefix(char *prefix, const char *adr, const char *fil) {
char *a;
strcpybuff(prefix, jump_identification_const(adr));
strcatbuff(prefix, fil);
strlcpybuff(prefix, jump_identification_const(adr), HTS_URLMAXSIZE * 2);
strlcatbuff(prefix, fil, HTS_URLMAXSIZE * 2);
a = strchr(prefix, '?');
if (a)
*a = '\0';

View File

@@ -67,8 +67,8 @@ int cookie_add(t_cookie * cookie, const char *cook_name, const char *cook_valu
int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, const char *path);
int cookie_load(t_cookie * cookie, const char *path, const char *name);
int cookie_save(t_cookie * cookie, const char *name);
void cookie_insert(char *s, const char *ins);
void cookie_delete(char *s, size_t pos);
void cookie_insert(char *s, size_t s_size, const char *ins);
void cookie_delete(char *s, size_t s_size, size_t pos);
const char *cookie_get(char *buffer, const char *cookie_base, int param);
char *cookie_find(char *s, const char *cook_name, const char *domain, const char *path);
char *cookie_nextfield(char *a);

View File

@@ -196,12 +196,13 @@ struct cache_back_zip_entry {
int compressionMethod;
};
#define ZIP_READFIELD_STRING(line, value, refline, refvalue) do { \
if (line[0] != '\0' && strfield2(line, refline)) { \
strcpybuff(refvalue, value); \
line[0] = '\0'; \
} \
} while(0)
#define ZIP_READFIELD_STRING(line, value, refline, refvalue, refvalue_size) \
do { \
if (line[0] != '\0' && strfield2(line, refline)) { \
strlcpybuff(refvalue, value, refvalue_size); \
line[0] = '\0'; \
} \
} while (0)
#define ZIP_READFIELD_INT(line, value, refline, refvalue) do { \
if (line[0] != '\0' && strfield2(line, refline)) { \
int intval = 0; \
@@ -643,7 +644,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
} else {
r.location = location_default;
}
strcpybuff(r.location, "");
r.location[0] = '\0';
strcpybuff(buff, adr);
strcatbuff(buff, fil);
hash_pos_return = coucal_read(cache->hashtable, buff, &hash_pos);
@@ -706,17 +707,25 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
value++;
ZIP_READFIELD_INT(line, value, "X-In-Cache", dataincache);
ZIP_READFIELD_INT(line, value, "X-Statuscode", r.statuscode);
ZIP_READFIELD_STRING(line, value, "X-StatusMessage", r.msg); // msg
ZIP_READFIELD_STRING(line, value, "X-StatusMessage", r.msg,
sizeof(r.msg));
ZIP_READFIELD_LLINT(line, value, "X-Size", r.size); // size
ZIP_READFIELD_STRING(line, value, "Content-Type", r.contenttype); // contenttype
ZIP_READFIELD_STRING(line, value, "X-Charset", r.charset); // contenttype
ZIP_READFIELD_STRING(line, value, "Last-Modified", r.lastmodified); // last-modified
ZIP_READFIELD_STRING(line, value, "Etag", r.etag); // Etag
ZIP_READFIELD_STRING(line, value, "Location", r.location); // 'location' pour moved
ZIP_READFIELD_STRING(line, value, "Content-Disposition", r.cdispo); // Content-disposition
ZIP_READFIELD_STRING(line, value, "Content-Type", r.contenttype,
sizeof(r.contenttype));
ZIP_READFIELD_STRING(line, value, "X-Charset", r.charset,
sizeof(r.charset));
ZIP_READFIELD_STRING(line, value, "Last-Modified", r.lastmodified,
sizeof(r.lastmodified));
ZIP_READFIELD_STRING(line, value, "Etag", r.etag, sizeof(r.etag));
// r.location is a char* pointing into a HTS_URLMAXSIZE*2 buffer
ZIP_READFIELD_STRING(line, value, "Location", r.location,
HTS_URLMAXSIZE * 2);
ZIP_READFIELD_STRING(line, value, "Content-Disposition", r.cdispo,
sizeof(r.cdispo));
//ZIP_READFIELD_STRING(line, value, "X-Addr", ..); // Original address
//ZIP_READFIELD_STRING(line, value, "X-Fil", ..); // Original URI filename
ZIP_READFIELD_STRING(line, value, "X-Save", previous_save_); // Original save filename
ZIP_READFIELD_STRING(line, value, "X-Save", previous_save_,
sizeof(previous_save_));
}
} while(offset < readSizeHeader && !lineEof);
//totalHeader = offset;
@@ -733,7 +742,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
}
}
if (return_save != NULL) {
strcpybuff(return_save, previous_save);
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
}
/* Complete fields */
@@ -1025,7 +1034,7 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
} else {
r.location = location_default;
}
strcpybuff(r.location, "");
r.location[0] = '\0';
#if HTS_FAST_CACHE
strcpybuff(buff, adr);
strcatbuff(buff, fil);
@@ -1111,7 +1120,7 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
previous_save[0] = '\0';
cache_rstr(cache->olddat, previous_save); // save
if (return_save != NULL) {
strcpybuff(return_save, previous_save);
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
}
}
if (cache->version >= 5) {
@@ -2088,7 +2097,7 @@ char *readfile_or(const char *fil, const char *defaultdata) {
char *adr = malloct(strlen(defaultdata) + 1);
if (adr) {
strcpybuff(adr, defaultdata);
strlcpybuff(adr, defaultdata, strlen(defaultdata) + 1);
return adr;
}
}

View File

@@ -40,6 +40,7 @@ Please visit our Website: http://www.httrack.com
#include "htscore.h"
#include "htsdefines.h"
#include "htsalias.h"
#include "htsbauth.h"
#include "htswrap.h"
#include "htsmodules.h"
#include "htszlib.h"
@@ -138,6 +139,19 @@ static void basic_selftests(void) {
fil_normalized(source, buffer);
// MD5 selftests
md5selftest();
// cookie_get field extraction (tab-separated, 0-based)
{
char cbuf[8192];
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 0), "a") == 0);
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 1), "b") == 0);
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 2), "c") == 0);
// multi-char fields catch length/boundary bugs that 1-char fields hide
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 0), "host") == 0);
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 2), "/path/to") == 0);
assertf(strcmp(cookie_get(cbuf, "a\t\tc", 1), "") == 0); // empty field
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 9), "") == 0); // beyond last
}
}
/* Self-tests for the htssafe.h bounded string ops (driven by httrack -#8).

62
tests/02_update-cache.test Executable file
View File

@@ -0,0 +1,62 @@
#!/bin/bash
#
# Update path: re-mirroring a site reads the cache (cache_readex) to decide what
# is up to date -- a path the one-shot crawl tests never exercise. Offline
# (file://), so it always runs.
#
# 1. mirror, then re-mirror unchanged -> the cache-read pass must complete clean
# (guards against a crash/abort/error in cache_readex).
# 2. change a source file, re-mirror -> the update must pick up the new content
# (guards the update decision that reads the cached metadata).
set -eu
site=$(mktemp -d)
out=$(mktemp -d)
trap 'rm -rf "$site" "$out"' EXIT
cat >"$site/index.html" <<EOF
<a href="a.html">a</a> <a href="sub/b.html">b</a>
EOF
echo 'OLDCONTENT' >"$site/a.html"
mkdir -p "$site/sub"
echo '<p>bbb</p>' >"$site/sub/b.html"
url="file://$site/index.html"
# count Error: lines in the log (grep -c exits 1 on zero matches: guard it)
errors() { grep -ciE '^[0-9:]*[[:space:]]Error:' "$out/hts-log.txt" || true; }
# 1. fresh mirror writes the cache
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test -e "$out/hts-cache/new.zip" || {
echo "no cache was written" >&2
exit 1
}
# 2. re-mirror unchanged: the update reads the cache and must complete cleanly
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test "$(errors)" = 0 || {
echo "update (unchanged) reported errors" >&2
exit 1
}
for suffix in a.html sub/b.html; do
find "$out" -path "*/$suffix" | grep -q . || {
echo "missing $suffix after update" >&2
exit 1
}
done
# 3. change a source file: the update must pick up the new content
sleep 1
echo 'NEWCONTENT' >"$site/a.html"
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test "$(errors)" = 0 || {
echo "update (changed) reported errors" >&2
exit 1
}
grep -q NEWCONTENT "$(find "$out" -path '*/a.html')" || {
echo "update did not pick up the changed source" >&2
exit 1
}

View File

@@ -22,6 +22,7 @@ TESTS = \
01_engine-simplify.test \
01_engine-strsafe.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \
11_crawl-cookies.test \
11_crawl-idna.test \