Compare commits

..

4 Commits

Author SHA1 Message Date
Xavier Roche
83ff148efd Add an in-process cache create/read/update self-test
Wire a new `httrack -#A <dir>` debug option that exercises the ZIP cache
end to end through the public API (cache_init / cache_add / cache_readex),
in a dedicated source file (htscache_selftest.c).

It stores, then reads back asserting every header field and the body
round-trip exactly:
- hand-crafted edge cases: a normal HTML page, an empty redirect with a
  near-limit location, a non-HTML body kept in cache via all-in-cache, and
  a binary body with embedded NUL and high bytes (compared with memcmp);
- a few thousand small entries, to stress the index/lookup at scale;
- a few large compressible and incompressible bodies, to exercise zlib
  deflate/inflate and large-buffer handling.

It then updates one entry and confirms the new value is read back. The
driver returns the number of mismatches so failures are observable. The
whole cache weighs ~1-2 MB and the run takes a fraction of a second.

The location case is sized to the cache's real per-header-line round-trip
limit: cached headers are parsed through a HTS_URLMAXSIZE-sized line
buffer, so a value longer than that is truncated on read regardless of
the larger r.location buffer; 1000 bytes stays safely under it.

A dedicated test (tests/01_engine-cache.test) drives the option, asserts
the success line, that a ZIP cache was written, and that its footprint
stays under a sane ceiling.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-14 17:47:04 +02:00
Xavier Roche
50bb02e729 Merge pull request #342 from xroche/cleanup/cache-rstr-s1
Bound the legacy .dat cache readers (cache_rstr / cache_brstr)
2026-06-14 16:44:27 +02:00
Xavier Roche
b80ee793ac Bound the legacy .dat cache readers (cache_rstr / cache_brstr)
cache_rstr() read an attacker-controlled length (clamped only to 32768) from a
CACHE-1.x .dat and fread() it straight into fixed htsblk fields (r.msg[80],
r.contenttype[64], ...) with no destination bound -- a heap/stack overflow from
a crafted/old cache (the audit's S1). cache_brstr() (the in-memory variant) had
the same shape and, worse, no length cap at all.

Thread a destination size into both:
- cache_rstr stores at most s_size-1 bytes and fseek()s past the remainder so
  the next field stays aligned (the field may be longer than the destination in
  a tampered cache).
- cache_brstr caps the length and bounds the copy.
Update every caller (htscache.c and htscoremain.c) to pass sizeof(field) /
HTS_URLMAXSIZE*2. cache_rstr_addr already malloc()s to the read size, so it is
left as is. Remove the dead cache_quickbrstr (no callers).

A dedicated cache self-test (create/read/update) follows separately.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-14 16:41:17 +02:00
Xavier Roche
d12456c1e8 Merge pull request #341 from xroche/test/cache-update
Add an offline update/cache regression test
2026-06-14 16:31:42 +02:00
8 changed files with 542 additions and 47 deletions

View File

@@ -56,6 +56,7 @@ whttrackrundir = $(bindir)
whttrackrun_SCRIPTS = webhttrack
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htscache_selftest.c \
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
htshelp.c htslib.c htscoremain.c \
htsname.c htsrobots.c htstools.c htswizard.c \
@@ -65,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
md5.c \
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
htsbasenet.h htsbauth.h htscache.h htscatchurl.h \
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htscatchurl.h \
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
htshelp.h htsindex.h htslib.h htsmd5.h \

View File

@@ -1105,20 +1105,24 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
//
cache_rint(cache->olddat, &r.statuscode);
cache_rLLint(cache->olddat, &r.size);
cache_rstr(cache->olddat, r.msg);
cache_rstr(cache->olddat, r.contenttype);
cache_rstr(cache->olddat, r.msg, sizeof(r.msg));
cache_rstr(cache->olddat, r.contenttype, sizeof(r.contenttype));
if (cache->version >= 3)
cache_rstr(cache->olddat, r.charset);
cache_rstr(cache->olddat, r.lastmodified);
cache_rstr(cache->olddat, r.etag);
cache_rstr(cache->olddat, r.location);
cache_rstr(cache->olddat, r.charset, sizeof(r.charset));
cache_rstr(cache->olddat, r.lastmodified, sizeof(r.lastmodified));
cache_rstr(cache->olddat, r.etag, sizeof(r.etag));
// r.location points into a HTS_URLMAXSIZE*2 buffer
cache_rstr(cache->olddat, r.location, HTS_URLMAXSIZE * 2);
if (cache->version >= 2)
cache_rstr(cache->olddat, r.cdispo);
cache_rstr(cache->olddat, r.cdispo, sizeof(r.cdispo));
if (cache->version >= 4) {
cache_rstr(cache->olddat, previous_save); // adr
cache_rstr(cache->olddat, previous_save); // fil
cache_rstr(cache->olddat, previous_save,
sizeof(previous_save)); // adr
cache_rstr(cache->olddat, previous_save,
sizeof(previous_save)); // fil
previous_save[0] = '\0';
cache_rstr(cache->olddat, previous_save); // save
cache_rstr(cache->olddat, previous_save,
sizeof(previous_save)); // save
if (return_save != NULL) {
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
}
@@ -1127,8 +1131,8 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
r.headers = cache_rstr_addr(cache->olddat);
}
//
cache_rstr(cache->olddat, check);
if (strcmp(check, "HTS") == 0) { /* intégrité OK */
cache_rstr(cache->olddat, check, sizeof(check));
if (strcmp(check, "HTS") == 0) { /* integrity OK */
ok = 1;
}
cache_rLLint(cache->olddat, &size_read); /* lire size pour être sûr de la taille déclarée (réécrire) */
@@ -1767,12 +1771,12 @@ void cache_init(cache_back * cache, httrackp * opt) {
char firstline[256];
char *a = cache->use;
a += cache_brstr(a, firstline);
if (strncmp(firstline, "CACHE-", 6) == 0) { // Nouvelle version du cache
if (strncmp(firstline, "CACHE-1.", 8) == 0) { // Version 1.1x
a += cache_brstr(a, firstline, sizeof(firstline));
if (strncmp(firstline, "CACHE-", 6) == 0) { // new cache format
if (strncmp(firstline, "CACHE-1.", 8) == 0) { // version 1.1x
cache->version = (int) (firstline[8] - '0'); // cache 1.x
if (cache->version <= 5) {
a += cache_brstr(a, firstline);
a += cache_brstr(a, firstline, sizeof(firstline));
strcpybuff(cache->lastmodified, firstline);
} else {
hts_log_print(opt, LOG_ERROR,
@@ -1783,7 +1787,7 @@ void cache_init(cache_back * cache, httrackp * opt) {
freet(cache->use);
cache->use = NULL;
}
} else { // non supporté
} else { // non supporté
hts_log_print(opt, LOG_ERROR,
"Cache: %s not supported, ignoring current cache",
firstline);
@@ -1793,7 +1797,7 @@ void cache_init(cache_back * cache, httrackp * opt) {
cache->use = NULL;
}
/* */
} else { // Vieille version du cache
} else { // Vieille version du cache
/* */
hts_log_print(opt, LOG_WARNING,
"Cache: importing old cache format");
@@ -2118,7 +2122,7 @@ int cache_wstr(FILE * fp, const char *s) {
return -1;
return 0;
}
void cache_rstr(FILE * fp, char *s) {
void cache_rstr(FILE *fp, char *s, size_t s_size) {
INTsys i;
char buff[256 + 4];
@@ -2127,13 +2131,26 @@ void cache_rstr(FILE * fp, char *s) {
if (i < 0 || i > 32768) /* error, something nasty happened */
i = 0;
if (i > 0) {
if ((int) fread(s, 1, i, fp) != i) {
/* Store at most s_size-1 bytes into s, but consume all i bytes from the
stream so the next field stays aligned (the field may be longer than the
destination in a tampered/old cache). */
const size_t want = (size_t) i;
const size_t store = want < s_size ? want : s_size - 1;
if (fread(s, 1, store, fp) != store) {
int fread_cache_failed = 0;
assertf(fread_cache_failed);
}
if (want > store && fseek(fp, (long) (want - store), SEEK_CUR) != 0) {
int fseek_cache_failed = 0;
assertf(fseek_cache_failed);
}
s[store] = '\0';
} else {
s[0] = '\0';
}
*(s + i) = '\0';
}
char *cache_rstr_addr(FILE * fp) {
INTsys i;
@@ -2157,7 +2174,7 @@ char *cache_rstr_addr(FILE * fp) {
}
return addr;
}
int cache_brstr(char *adr, char *s) {
int cache_brstr(char *adr, char *s, size_t s_size) {
int i;
int off;
char buff[256 + 4];
@@ -2165,23 +2182,17 @@ int cache_brstr(char *adr, char *s) {
off = binput(adr, buff, 256);
adr += off;
sscanf(buff, "%d", &i);
if (i > 0)
strncpy(s, adr, i);
*(s + i) = '\0';
off += i;
return off;
}
int cache_quickbrstr(char *adr, char *s) {
int i;
int off;
char buff[256 + 4];
if (i < 0 || i > 32768) /* guard a corrupt length */
i = 0;
if (i > 0) {
/* copy at most s_size-1 bytes; advance past the full field regardless */
const size_t store = (size_t) i < s_size ? (size_t) i : s_size - 1;
off = binput(adr, buff, 256);
adr += off;
sscanf(buff, "%d", &i);
if (i > 0)
strncpy(s, adr, i);
*(s + i) = '\0';
strncpy(s, adr, store);
s[store] = '\0';
} else {
s[0] = '\0';
}
off += i;
return off;
}
@@ -2189,7 +2200,7 @@ int cache_quickbrstr(char *adr, char *s) {
/* idem, mais en int */
int cache_brint(char *adr, int *i) {
char s[256];
int r = cache_brstr(adr, s);
int r = cache_brstr(adr, s, sizeof(s));
if (r != -1)
sscanf(s, "%d", i);
@@ -2198,7 +2209,7 @@ int cache_brint(char *adr, int *i) {
void cache_rint(FILE * fp, int *i) {
char s[256];
cache_rstr(fp, s);
cache_rstr(fp, s, sizeof(s));
sscanf(s, "%d", i);
}
int cache_wint(FILE * fp, int i) {
@@ -2210,7 +2221,7 @@ int cache_wint(FILE * fp, int i) {
void cache_rLLint(FILE * fp, LLint * i) {
char s[256];
cache_rstr(fp, s);
cache_rstr(fp, s, sizeof(s));
sscanf(s, LLintP, i);
}
int cache_wLLint(FILE * fp, LLint i) {

View File

@@ -80,10 +80,9 @@ int cache_writedata(FILE * cache_ndx, FILE * cache_dat, const char *str1,
int cache_readdata(cache_back * cache, const char *str1, const char *str2,
char **inbuff, int *len);
void cache_rstr(FILE * fp, char *s);
void cache_rstr(FILE *fp, char *s, size_t s_size);
char *cache_rstr_addr(FILE * fp);
int cache_brstr(char *adr, char *s);
int cache_quickbrstr(char *adr, char *s);
int cache_brstr(char *adr, char *s, size_t s_size);
int cache_brint(char *adr, int *i);
void cache_rint(FILE * fp, int *i);
void cache_rLLint(FILE * fp, LLint * i);

374
src/htscache_selftest.c Normal file
View File

@@ -0,0 +1,374 @@
/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Important notes:
- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: htscache_selftest.c subroutines: */
/* in-process self-test for the (ZIP) cache subsystem */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
/* Drives the public cache API (cache_init / cache_add / cache_readex)
through a create -> read -> update cycle on a real on-disk ZIP cache,
asserting every header field and the (binary-safe) body round-trips.
Besides a few hand-crafted edge cases it stores a few thousand entries
(index/lookup scale) and a handful of large compressible/incompressible
bodies (zlib deflate/inflate). Reached via `httrack -#A <dir>`. */
#define HTS_INTERNAL_BYTECODE
#include "htscache_selftest.h"
#include "htscache.h"
#include "htscore.h"
#include "htslib.h"
#include "htszlib.h"
#include <stdio.h>
#include <string.h>
#define SELFTEST_VOLUME 3000 /* number of small entries in the scale pass */
/* Open a cache session. A write session (ro=0) rotates new.zip -> old.zip and
opens a fresh new.zip; a read session (ro=1) opens new.zip in place. */
static void selftest_open(cache_back *cache, httrackp *opt, int ro) {
memset(cache, 0, sizeof(*cache));
cache->type = 1;
cache->log = stderr;
cache->errlog = stderr;
cache->hashtable = coucal_new(0);
cache->ro = ro;
cache_init(cache, opt);
}
static void selftest_open_for_write(cache_back *cache, httrackp *opt) {
selftest_open(cache, opt, 0);
}
static void selftest_open_for_read(cache_back *cache, httrackp *opt) {
selftest_open(cache, opt, 1);
}
static void selftest_close(cache_back *cache) {
if (cache->dat != NULL) {
fclose(cache->dat);
cache->dat = NULL;
}
if (cache->ndx != NULL) {
fclose(cache->ndx);
cache->ndx = NULL;
}
if (cache->zipOutput != NULL) {
zipClose(cache->zipOutput,
"Created by HTTrack Website Copier (cache self-test)");
cache->zipOutput = NULL;
}
if (cache->zipInput != NULL) {
unzClose(cache->zipInput);
cache->zipInput = NULL;
}
/* hashtable is intentionally not coucal_delete()d: it would dump a stats
summary to stderr on every call, and this is a one-shot CLI subcommand
that exits right after (same choice as the other -# cache subcommands) */
}
/* Store one entry. The body is copied into a private buffer (any size), so
callers may pass const data and cache_add never sees a cast-away qualifier;
it consumes everything synchronously, so the copy is freed on return. */
static void store_entry(httrackp *opt, cache_back *cache, const char *adr,
const char *fil, const char *save, int statuscode,
const char *msg, const char *contenttype,
const char *charset, const char *lastmodified,
const char *etag, const char *location,
const char *body, size_t body_len) {
htsblk r;
char locbuf[HTS_URLMAXSIZE * 2];
char *bodycopy = NULL;
hts_init_htsblk(&r);
r.statuscode = statuscode;
r.size = (LLint) body_len;
strcpybuff(r.msg, msg);
strcpybuff(r.contenttype, contenttype);
strcpybuff(r.charset, charset);
strcpybuff(r.lastmodified, lastmodified);
strcpybuff(r.etag, etag);
strcpybuff(locbuf, location);
r.location = locbuf;
r.is_write = 0;
/* an empty body must be a NULL pointer: cache_add rejects a non-NULL
pointer with size 0 */
if (body_len != 0) {
bodycopy = malloct(body_len);
memcpy(bodycopy, body, body_len);
r.adr = bodycopy;
}
/* all_in_cache=1: keep the body in the ZIP whatever the content-type,
so the read path never depends on a file on disk */
cache_add(opt, cache, &r, adr, fil, save, 1, NULL);
if (bodycopy != NULL) {
freet(bodycopy);
}
}
/* Read one entry back and check every field. Returns the number of
mismatches (0 == success). */
static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
const char *fil, int statuscode, const char *msg,
const char *contenttype, const char *charset,
const char *lastmodified, const char *etag,
const char *location, const char *body,
size_t body_len) {
int fail = 0;
char *locbuf = malloct(HTS_URLMAXSIZE * 2);
htsblk r;
locbuf[0] = '\0';
/* readonly=1: pure read, no rename/disk-write decision logic */
r = cache_readex(opt, cache, adr, fil, "", locbuf, NULL, 1);
#define CHECK_STR(field, want) \
do { \
if (strcmp((field), (want)) != 0) { \
fprintf(stderr, \
"cache-selftest: %s%s: " #field " is '%s', expected '%s'\n", \
adr, fil, (field), (want)); \
fail++; \
} \
} while (0)
if (r.statuscode != statuscode) {
fprintf(stderr, "cache-selftest: %s%s: statuscode is %d, expected %d\n",
adr, fil, r.statuscode, statuscode);
fail++;
}
CHECK_STR(r.msg, msg);
CHECK_STR(r.contenttype, contenttype);
CHECK_STR(r.charset, charset);
CHECK_STR(r.lastmodified, lastmodified);
CHECK_STR(r.etag, etag);
CHECK_STR(locbuf, location);
if (r.size != (LLint) body_len) {
fprintf(stderr, "cache-selftest: %s%s: size is " LLintP ", expected %d\n",
adr, fil, (LLint) r.size, (int) body_len);
fail++;
} else if (body_len != 0 &&
(r.adr == NULL || memcmp(r.adr, body, body_len) != 0)) {
fprintf(stderr, "cache-selftest: %s%s: body mismatch\n", adr, fil);
fail++;
}
#undef CHECK_STR
if (r.adr != NULL) {
freet(r.adr);
}
freet(locbuf);
return fail;
}
/* Fill a body of the requested size. kind 0 is highly compressible (a short
repeating pattern), kind 1 is incompressible (a deterministic PRNG), kind 2
alternates the two -- together they exercise both deflate outcomes. */
static void gen_body(char *buf, size_t len, int kind) {
unsigned int seed = 0x9e3779b1u ^ (unsigned int) len;
size_t j;
for (j = 0; j < len; j++) {
if (kind == 0 || (kind == 2 && (j & 1) == 0)) {
buf[j] = (char) ('A' + (j % 26));
} else {
seed = seed * 1103515245u + 12345u;
buf[j] = (char) (seed >> 16);
}
}
}
int cache_selftests(httrackp *opt, const char *dir) {
int failures = 0;
cache_back cache;
int i;
/* near-limit field values. The etag stresses htsblk.etag[256]; the location
stresses a long redirect URL. Each cached header line is read back through
a HTS_URLMAXSIZE-sized parse buffer ("<field>: <value>\r\n"), so the
round-trippable value is shorter than HTS_URLMAXSIZE: 1000 stays safely
under that real limit. */
static char etag_long[251];
static char location_long[1001];
/* a body with embedded NUL and high bytes, to prove binary safety */
static const char binary_body[] = {
'P', 'N', 'G', '\0', '\r', '\n', (char) 0xFF, (char) 0x80,
'\0', '\0', 'e', 'n', 'd', (char) 0xCA, (char) 0xFE, '\n'};
/* large bodies for the compression pass; kept alive across the write and
read passes so the read can compare against them */
static const size_t large_size[] = {200000, 200000, 50000};
const int large_count = (int) (sizeof(large_size) / sizeof(large_size[0]));
char *large_body[3];
/* edge-case bodies, named so store and read assert the exact same bytes */
const char *const body_index = "<html><body>hello</body></html>";
const char *const body_api = "{\"k\":\"v\"}";
const char *const body_updated = "<html><body>UPDATED CONTENT</body></html>";
const char *const body_404 = "<html><body>404 Not Found</body></html>";
memset(etag_long, 'E', sizeof(etag_long) - 1);
etag_long[sizeof(etag_long) - 1] = '\0';
memset(location_long, 'L', sizeof(location_long) - 1);
location_long[sizeof(location_long) - 1] = '\0';
for (i = 0; i < large_count; i++) {
large_body[i] = malloct(large_size[i]);
gen_body(large_body[i], large_size[i], i);
}
/* set up an isolated cache directory */
{
char base[HTS_URLMAXSIZE];
strcpybuff(base, dir);
if (base[0] != '\0' && base[strlen(base) - 1] != '/') {
strcatbuff(base, "/");
}
StringCopy(opt->path_log, base);
}
opt->cache = 1;
/* pass 1: create everything in a single write session */
selftest_open_for_write(&cache, opt);
/* edge cases: normal HTML page */
store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
"OK", "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
"etag-normal", "", body_index, strlen(body_index));
/* redirect: empty body, empty optional fields, near-limit location */
store_entry(opt, &cache, "example.com", "/moved", "example.com/moved.html",
301, "Moved Permanently", "text/html", "", "", "", location_long,
NULL, 0);
/* non-HTML content-type kept in cache via all_in_cache, near-limit etag */
store_entry(opt, &cache, "example.com", "/api", "example.com/api.json", 200,
"OK", "application/json", "utf-8",
"Tue, 02 Jan 2024 12:00:00 GMT", etag_long, "", body_api,
strlen(body_api));
/* binary body */
store_entry(opt, &cache, "example.com", "/logo", "example.com/logo.png", 200,
"OK", "image/png", "", "", "etag-bin", "", binary_body,
sizeof(binary_body));
/* error status with a body and a location (non-2xx codes are cached too) */
store_entry(opt, &cache, "example.com", "/gone", "example.com/gone.html", 404,
"Not Found", "text/html", "utf-8", "", "etag-404",
"https://example.com/where-it-went", body_404, strlen(body_404));
/* scale: a few thousand small entries */
for (i = 0; i < SELFTEST_VOLUME; i++) {
char fil[64], save[128], body[64];
sprintf(fil, "/v/%05d", i);
sprintf(save, "example.com/v/%05d.html", i);
sprintf(body, "<html>volume entry %d</html>", i);
store_entry(opt, &cache, "example.com", fil, save, 200, "OK", "text/html",
"utf-8", "", "", "", body, strlen(body));
}
/* compression: a few large bodies */
for (i = 0; i < large_count; i++) {
char fil[64], save[128];
sprintf(fil, "/big/%d.bin", i);
sprintf(save, "example.com/big/%d.bin", i);
store_entry(opt, &cache, "example.com", fil, save, 200, "OK",
"application/octet-stream", "", "", "", "", large_body[i],
large_size[i]);
}
selftest_close(&cache);
/* pass 2: read back and verify everything round-tripped */
selftest_open_for_read(&cache, opt);
failures += check_entry(opt, &cache, "example.com", "/", 200, "OK",
"text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
"etag-normal", "", body_index, strlen(body_index));
failures += check_entry(opt, &cache, "example.com", "/moved", 301,
"Moved Permanently", "text/html", "", "", "",
location_long, NULL, 0);
failures +=
check_entry(opt, &cache, "example.com", "/api", 200, "OK",
"application/json", "utf-8", "Tue, 02 Jan 2024 12:00:00 GMT",
etag_long, "", body_api, strlen(body_api));
failures +=
check_entry(opt, &cache, "example.com", "/logo", 200, "OK", "image/png",
"", "", "etag-bin", "", binary_body, sizeof(binary_body));
failures += check_entry(opt, &cache, "example.com", "/gone", 404, "Not Found",
"text/html", "utf-8", "", "etag-404",
"https://example.com/where-it-went", body_404,
strlen(body_404));
for (i = 0; i < SELFTEST_VOLUME; i++) {
char fil[64], body[64];
sprintf(fil, "/v/%05d", i);
sprintf(body, "<html>volume entry %d</html>", i);
failures +=
check_entry(opt, &cache, "example.com", fil, 200, "OK", "text/html",
"utf-8", "", "", "", body, strlen(body));
}
for (i = 0; i < large_count; i++) {
char fil[64];
sprintf(fil, "/big/%d.bin", i);
failures += check_entry(opt, &cache, "example.com", fil, 200, "OK",
"application/octet-stream", "", "", "", "",
large_body[i], large_size[i]);
}
selftest_close(&cache);
/* pass 3: update one edge entry with new body and headers */
selftest_open_for_write(&cache, opt);
store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
"OK", "text/html", "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT",
"etag-updated", "", body_updated, strlen(body_updated));
selftest_close(&cache);
/* pass 4: re-read and confirm the updated value, not the old one */
selftest_open_for_read(&cache, opt);
failures +=
check_entry(opt, &cache, "example.com", "/", 200, "OK", "text/html",
"iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT", "etag-updated",
"", body_updated, strlen(body_updated));
selftest_close(&cache);
for (i = 0; i < large_count; i++) {
freet(large_body[i]);
}
return failures;
}

49
src/htscache_selftest.h Normal file
View File

@@ -0,0 +1,49 @@
/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Important notes:
- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: htscache_selftest.h */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
#ifndef HTSCACHE_SELFTEST_DEFH
#define HTSCACHE_SELFTEST_DEFH
#ifdef HTS_INTERNAL_BYTECODE
#ifndef HTS_DEF_FWSTRUCT_httrackp
#define HTS_DEF_FWSTRUCT_httrackp
typedef struct httrackp httrackp;
#endif
/* Run the cache create/read/update self-test against a working directory.
Returns the number of failed checks (0 == success). */
int cache_selftests(httrackp *opt, const char *dir);
#endif
#endif

View File

@@ -46,6 +46,7 @@ Please visit our Website: http://www.httrack.com
#include "htszlib.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htscache_selftest.h"
#include "htsmd5.h"
#include <ctype.h>
@@ -2113,6 +2114,19 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
case '#':{ // non documenté
com++;
switch (*com) {
case 'A': // cache self-test: httrack -#A <dir>
if (na + 1 < argc) {
const int err = cache_selftests(opt, argv[na + 1]);
printf("cache-selftest: %s\n", err ? "FAIL" : "OK");
htsmain_free();
return err;
} else {
fprintf(stderr, "Option #A requires a directory argument\n");
htsmain_free();
return 1;
}
break;
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
{
int hasFilter = 0;
@@ -2155,8 +2169,8 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
char firstline[256];
char *a = cacheNdx;
a += cache_brstr(a, firstline);
a += cache_brstr(a, firstline);
a += cache_brstr(a, firstline, sizeof(firstline));
a += cache_brstr(a, firstline, sizeof(firstline));
while(a != NULL) {
a = strchr(a + 1, '\n'); /* start of line */
if (a) {

46
tests/01_engine-cache.test Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
#
# Cache create/read/update logic (driven by 'httrack -#A <dir>').
#
# The in-process self-test stores several hand-crafted edge entries (normal
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
# all-in-cache, a binary body with embedded NUL/high bytes), a few thousand
# small entries (index/lookup scale), and a few large compressible and
# incompressible bodies (zlib deflate/inflate). It reads everything back
# asserting every header field and the body round-trip byte for byte, then
# updates one entry and confirms the new value is read back. It exits non-zero
# on the first mismatch.
set -eu
dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT
# Like the other -# debug modes, a trailing token (the working directory) is
# required; a bare '-#A' falls through to the usage screen.
out=$(httrack -#A "$dir")
# Match the exact success line, so the test cannot pass for an unrelated reason
# (e.g. the -#A mode being gone and falling through to the usage screen, which
# also exits non-zero but never prints this).
test "$out" = "cache-selftest: OK" || {
echo "expected 'cache-selftest: OK', got: $out" >&2
exit 1
}
# The self-test must have actually produced a ZIP cache on disk.
test -e "$dir/hts-cache/new.zip" || {
echo "no ZIP cache was written by the self-test" >&2
exit 1
}
# Sanity-check the cache footprint: the few-thousand-entry pass is expected to
# weigh ~1-2 MB. Fail if it balloons well past that (e.g. a per-entry overhead
# regression or runaway growth), so the cache size stays bounded.
ceiling=$((4 * 1024 * 1024))
bytes=$(du -sb "$dir/hts-cache" | cut -f1)
test "$bytes" -le "$ceiling" || {
echo "cache footprint $bytes bytes exceeds ${ceiling} ceiling" >&2
exit 1
}

View File

@@ -11,6 +11,7 @@ TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
TEST_EXTENSIONS = .test
TESTS = \
00_runnable.test \
01_engine-cache.test \
01_engine-charset.test \
01_engine-cmdline.test \
01_engine-entities.test \