From 83ff148efd115cc2864b8ec4f12070e032a7ccfa Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sun, 14 Jun 2026 17:21:04 +0200 Subject: [PATCH] Add an in-process cache create/read/update self-test Wire a new `httrack -#A <dir>` debug option that exercises the ZIP cache end to end through the public API (cache_init / cache_add / cache_readex), in a dedicated source file (htscache_selftest.c). It stores, then reads back asserting every header field and the body round-trip exactly: - hand-crafted edge cases: a normal HTML page, an empty redirect with a near-limit location, a non-HTML body kept in cache via all-in-cache, and a binary body with embedded NUL and high bytes (compared with memcmp); - a few thousand small entries, to stress the index/lookup at scale; - a few large compressible and incompressible bodies, to exercise zlib deflate/inflate and large-buffer handling. It then updates one entry and confirms the new value is read back. The driver returns the number of mismatches so failures are observable. The whole cache weighs ~1-2 MB and the run takes a fraction of a second. The location case is sized to the cache's real per-header-line round-trip limit: cached headers are parsed through a HTS_URLMAXSIZE-sized line buffer, so a value longer than that is truncated on read regardless of the larger r.location buffer; 1000 bytes stays safely under it. A dedicated test (tests/01_engine-cache.test) drives the option, asserts the success line, that a ZIP cache was written, and that its footprint stays under a sane ceiling. 
Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- src/Makefile.am | 3 +- src/htscache_selftest.c | 374 +++++++++++++++++++++++++++++++++++++ src/htscache_selftest.h | 49 +++++ src/htscoremain.c | 14 ++ tests/01_engine-cache.test | 46 +++++ tests/Makefile.am | 1 + 6 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 src/htscache_selftest.c create mode 100644 src/htscache_selftest.h create mode 100755 tests/01_engine-cache.test diff --git a/src/Makefile.am b/src/Makefile.am index ffde103..4022d4a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -56,6 +56,7 @@ whttrackrundir = $(bindir) whttrackrun_SCRIPTS = webhttrack libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ + htscache_selftest.c \ htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \ htshelp.c htslib.c htscoremain.c \ htsname.c htsrobots.c htstools.c htswizard.c \ @@ -65,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ md5.c \ minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \ hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \ - htsbasenet.h htsbauth.h htscache.h htscatchurl.h \ + htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htscatchurl.h \ htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \ htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \ htshelp.h htsindex.h htslib.h htsmd5.h \ diff --git a/src/htscache_selftest.c b/src/htscache_selftest.c new file mode 100644 index 0000000..c1a7eea --- /dev/null +++ b/src/htscache_selftest.c @@ -0,0 +1,374 @@ +/* ------------------------------------------------------------ */ +/* +HTTrack Website Copier, Offline Browser for Windows and Unix +Copyright (C) 1998-2017 Xavier Roche and other contributors + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) 
any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +Important notes: + +- We hereby ask people using this source NOT to use it in purpose of grabbing +emails addresses, or collecting any other private information on persons. +This would disgrace our work, and spoil the many hours we spent on it. + +Please visit our Website: http://www.httrack.com +*/ + +/* ------------------------------------------------------------ */ +/* File: htscache_selftest.c subroutines: */ +/* in-process self-test for the (ZIP) cache subsystem */ +/* Author: Xavier Roche */ +/* ------------------------------------------------------------ */ + +/* Drives the public cache API (cache_init / cache_add / cache_readex) + through a create -> read -> update cycle on a real on-disk ZIP cache, + asserting every header field and the (binary-safe) body round-trips. + Besides a few hand-crafted edge cases it stores a few thousand entries + (index/lookup scale) and a handful of large compressible/incompressible + bodies (zlib deflate/inflate). Reached via `httrack -#A <dir>`. */ + +#define HTS_INTERNAL_BYTECODE + +#include "htscache_selftest.h" + +#include "htscache.h" +#include "htscore.h" +#include "htslib.h" +#include "htszlib.h" + +#include <stdio.h> +#include <string.h> + +#define SELFTEST_VOLUME 3000 /* number of small entries in the scale pass */ + +/* Open a cache session. A write session (ro=0) rotates new.zip -> old.zip and + opens a fresh new.zip; a read session (ro=1) opens new.zip in place. 
*/
+static void selftest_open(cache_back *cache, httrackp *opt, int ro) {
+  /* wipe the whole structure first, so every handle selftest_close() looks at
+     starts out zeroed */
+  memset(cache, 0, sizeof *cache);
+
+  /* session parameters, all in place before cache_init() sees the structure */
+  cache->ro = ro;
+  cache->type = 1;
+  cache->hashtable = coucal_new(0);
+  cache->errlog = stderr;
+  cache->log = stderr;
+
+  cache_init(cache, opt);
+}
+
+/* Start a write session (ro=0) on the cache. */
+static void selftest_open_for_write(cache_back *cache, httrackp *opt) {
+  const int read_only = 0;
+
+  selftest_open(cache, opt, read_only);
+}
+
+/* Start a read session (ro=1) on the cache. */
+static void selftest_open_for_read(cache_back *cache, httrackp *opt) {
+  const int read_only = 1;
+
+  selftest_open(cache, opt, read_only);
+}
+
+/* End a session: close whichever stdio streams and ZIP handles are open and
+   leave the four handle fields NULL. */
+static void selftest_close(cache_back *cache) {
+  /* stdio streams */
+  if (cache->dat != NULL) {
+    fclose(cache->dat);
+  }
+  cache->dat = NULL;
+
+  if (cache->ndx != NULL) {
+    fclose(cache->ndx);
+  }
+  cache->ndx = NULL;
+
+  /* ZIP being written, then ZIP being read */
+  if (cache->zipOutput != NULL) {
+    zipClose(cache->zipOutput,
+             "Created by HTTrack Website Copier (cache self-test)");
+  }
+  cache->zipOutput = NULL;
+
+  if (cache->zipInput != NULL) {
+    unzClose(cache->zipInput);
+  }
+  cache->zipInput = NULL;
+
+  /* the hashtable is deliberately left alone: coucal_delete() would print a
+     stats summary on stderr at every close, and this one-shot CLI subcommand
+     exits right afterwards (same choice as the other -# cache subcommands) */
+}
+
+/* Store one entry. The body is copied into a private buffer (any size), so
+   callers may pass const data and cache_add never sees a cast-away qualifier;
+   it consumes everything synchronously, so the copy is freed on return.
*/
+static void store_entry(httrackp *opt, cache_back *cache, const char *adr,
+                        const char *fil, const char *save, int statuscode,
+                        const char *msg, const char *contenttype,
+                        const char *charset, const char *lastmodified,
+                        const char *etag, const char *location,
+                        const char *body, size_t body_len) {
+  /* Parameters: adr/fil name the entry and save is the save name, all three
+     handed to cache_add as-is; statuscode, msg, contenttype, charset,
+     lastmodified, etag and location fill the htsblk fields of the same name;
+     body/body_len is the payload (NULL/0 for an empty body).
+     On allocation failure nothing is stored and a message goes to stderr. */
+  htsblk r;
+  char locbuf[HTS_URLMAXSIZE * 2];
+  char *bodycopy = NULL;
+
+  hts_init_htsblk(&r);
+  r.statuscode = statuscode;
+  r.size = (LLint) body_len;
+  strcpybuff(r.msg, msg);
+  strcpybuff(r.contenttype, contenttype);
+  strcpybuff(r.charset, charset);
+  strcpybuff(r.lastmodified, lastmodified);
+  strcpybuff(r.etag, etag);
+  /* the location lives in a local buffer that r merely points to */
+  strcpybuff(locbuf, location);
+  r.location = locbuf;
+  r.is_write = 0;
+  /* an empty body must be a NULL pointer: cache_add rejects a non-NULL
+     pointer with size 0 */
+  if (body_len != 0) {
+    bodycopy = malloct(body_len);
+    if (bodycopy == NULL) {
+      /* never hand a NULL destination to memcpy(): report and skip the entry
+         (the read-back pass is then expected to flag it as a mismatch) */
+      fprintf(stderr,
+              "cache-selftest: %s%s: out of memory, entry not stored\n", adr,
+              fil);
+      return;
+    }
+    memcpy(bodycopy, body, body_len);
+    r.adr = bodycopy;
+  }
+  /* all_in_cache=1: keep the body in the ZIP whatever the content-type,
+     so the read path never depends on a file on disk */
+  cache_add(opt, cache, &r, adr, fil, save, 1, NULL);
+  if (bodycopy != NULL) {
+    freet(bodycopy);
+  }
+}
+
+/* Read one entry back and check every field. Returns the number of
+   mismatches (0 == success).
*/
+static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
+                       const char *fil, int statuscode, const char *msg,
+                       const char *contenttype, const char *charset,
+                       const char *lastmodified, const char *etag,
+                       const char *location, const char *body,
+                       size_t body_len) {
+  /* Parameters: adr/fil name the entry to look up; the remaining arguments
+     are the expected values. Each differing field is reported on stderr and
+     counted once; a failed allocation of the location buffer counts as one
+     mismatch. */
+  int fail = 0;
+  char *locbuf = malloct(HTS_URLMAXSIZE * 2);
+  htsblk r;
+
+  if (locbuf == NULL) {
+    /* nothing can be read without the location buffer: report a failure
+       instead of writing through a NULL pointer */
+    fprintf(stderr, "cache-selftest: %s%s: out of memory\n", adr, fil);
+    return 1;
+  }
+
+  locbuf[0] = '\0';
+  /* readonly=1: pure read, no rename/disk-write decision logic */
+  r = cache_readex(opt, cache, adr, fil, "", locbuf, NULL, 1);
+
+/* Compare one string field; #field is stringized into the message, so the
+   variable names r and locbuf are part of the output. */
+#define CHECK_STR(field, want)                                                 \
+  do {                                                                         \
+    if (strcmp((field), (want)) != 0) {                                        \
+      fprintf(stderr,                                                          \
+              "cache-selftest: %s%s: " #field " is '%s', expected '%s'\n",     \
+              adr, fil, (field), (want));                                      \
+      fail++;                                                                  \
+    }                                                                          \
+  } while (0)
+
+  if (r.statuscode != statuscode) {
+    fprintf(stderr, "cache-selftest: %s%s: statuscode is %d, expected %d\n",
+            adr, fil, r.statuscode, statuscode);
+    fail++;
+  }
+  CHECK_STR(r.msg, msg);
+  CHECK_STR(r.contenttype, contenttype);
+  CHECK_STR(r.charset, charset);
+  CHECK_STR(r.lastmodified, lastmodified);
+  CHECK_STR(r.etag, etag);
+  CHECK_STR(locbuf, location);
+
+  /* the body bytes are only compared once the size is known to match; memcmp
+     keeps the comparison binary-safe */
+  if (r.size != (LLint) body_len) {
+    fprintf(stderr, "cache-selftest: %s%s: size is " LLintP ", expected %d\n",
+            adr, fil, (LLint) r.size, (int) body_len);
+    fail++;
+  } else if (body_len != 0 &&
+             (r.adr == NULL || memcmp(r.adr, body, body_len) != 0)) {
+    fprintf(stderr, "cache-selftest: %s%s: body mismatch\n", adr, fil);
+    fail++;
+  }
+
+#undef CHECK_STR
+
+  /* the body buffer returned by cache_readex is released here */
+  if (r.adr != NULL) {
+    freet(r.adr);
+  }
+  freet(locbuf);
+  return fail;
+}
+
+/* Fill a body of the requested size. kind 0 is highly compressible (a short
+   repeating pattern), kind 1 is incompressible (a deterministic PRNG), kind 2
+   alternates the two -- together they exercise both deflate outcomes.
*/
+static void gen_body(char *buf, size_t len, int kind) {
+  /* the seed depends only on len, so a given (len, kind) pair always yields
+     the same bytes */
+  unsigned int seed = 0x9e3779b1u ^ (unsigned int) len;
+  size_t j;
+
+  for (j = 0; j < len; j++) {
+    if (kind == 0 || (kind == 2 && (j & 1) == 0)) {
+      /* repeating A..Z pattern */
+      buf[j] = (char) ('A' + (j % 26));
+    } else {
+      /* linear congruential step; unsigned arithmetic wraps by definition */
+      seed = seed * 1103515245u + 12345u;
+      buf[j] = (char) (seed >> 16);
+    }
+  }
+}
+
+/* Entry point of the self-test. Points opt->path_log at `dir` (a trailing '/'
+   is appended when missing), enables opt->cache, then runs four sessions:
+   write everything, read everything back, rewrite the "/" entry, re-read it.
+   Returns the number of failed checks, 0 meaning success; running out of
+   memory for the large bodies is reported as a single failure. */
+int cache_selftests(httrackp *opt, const char *dir) {
+  int failures = 0;
+  cache_back cache;
+  int i;
+
+  /* near-limit field values. The etag stresses htsblk.etag[256]; the location
+     stresses a long redirect URL. Each cached header line is read back through
+     a HTS_URLMAXSIZE-sized parse buffer (as "name: value\r\n"), so the
+     round-trippable value is shorter than HTS_URLMAXSIZE: 1000 stays safely
+     under that real limit. */
+  static char etag_long[251];
+  static char location_long[1001];
+
+  /* a body with embedded NUL and high bytes, to prove binary safety */
+  static const char binary_body[] = {
+      'P',  'N',  'G', '\0', '\r', '\n',        (char) 0xFF, (char) 0x80,
+      '\0', '\0', 'e', 'n',  'd',  (char) 0xCA, (char) 0xFE, '\n'};
+
+  /* large bodies for the compression pass; kept alive across the write and
+     read passes so the read can compare against them. The pointer array is
+     sized from large_size[] so the two can never disagree. */
+  static const size_t large_size[] = {200000, 200000, 50000};
+  const int large_count = (int) (sizeof(large_size) / sizeof(large_size[0]));
+  char *large_body[sizeof(large_size) / sizeof(large_size[0])];
+
+  /* edge-case bodies, named so store and read assert the exact same bytes */
+  const char *const body_index = "hello";
+  const char *const body_api = "{\"k\":\"v\"}";
+  const char *const body_updated = "UPDATED CONTENT";
+  const char *const body_404 = "404 Not Found";
+
+  memset(etag_long, 'E', sizeof(etag_long) - 1);
+  etag_long[sizeof(etag_long) - 1] = '\0';
+  memset(location_long, 'L', sizeof(location_long) - 1);
+  location_long[sizeof(location_long) - 1] = '\0';
+
+  /* the loop index doubles as the gen_body() kind */
+  for (i = 0; i < large_count; i++) {
+    large_body[i] = malloct(large_size[i]);
+    if (large_body[i] == NULL) {
+      /* do not let gen_body() write through NULL: release what was already
+         allocated and report the run as failed */
+      fprintf(stderr, "cache-selftest: out of memory for large body %d\n", i);
+      while (i-- > 0) {
+        freet(large_body[i]);
+      }
+      return 1;
+    }
+    gen_body(large_body[i], large_size[i], i);
+  }
+
+  /* set up an isolated cache directory */
+  {
+    char base[HTS_URLMAXSIZE];
+
+    strcpybuff(base, dir);
+    if (base[0] != '\0' && base[strlen(base) - 1] != '/') {
+      strcatbuff(base, "/");
+    }
+    StringCopy(opt->path_log, base);
+  }
+  opt->cache = 1;
+
+  /* pass 1: create everything in a single write session */
+  selftest_open_for_write(&cache, opt);
+
+  /* edge cases: normal HTML page */
+  store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
+              "OK", "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
+              "etag-normal", "", body_index, strlen(body_index));
+  /* redirect: empty body, empty optional fields, near-limit location */
+  store_entry(opt, &cache, "example.com", "/moved", "example.com/moved.html",
+              301, "Moved Permanently", "text/html", "", "", "", location_long,
+              NULL, 0);
+  /* non-HTML content-type kept in cache via all_in_cache, near-limit etag */
+  store_entry(opt, &cache, "example.com", "/api", "example.com/api.json", 200,
+              "OK", "application/json", "utf-8",
+              "Tue, 02 Jan 2024 12:00:00 GMT", etag_long, "", body_api,
+              strlen(body_api));
+  /* binary body */
+  store_entry(opt, &cache, "example.com", "/logo", "example.com/logo.png", 200,
+              "OK", "image/png", "", "", "etag-bin", "", binary_body,
+              sizeof(binary_body));
+  /* error status with a body and a location (non-2xx codes are cached too) */
+  store_entry(opt, &cache, "example.com", "/gone", "example.com/gone.html", 404,
+              "Not Found", "text/html", "utf-8", "", "etag-404",
+              "https://example.com/where-it-went", body_404, strlen(body_404));
+
+  /* scale: a few thousand small entries (the fixed buffers comfortably hold
+     the formatted names for any int index) */
+  for (i = 0; i < SELFTEST_VOLUME; i++) {
+    char fil[64], save[128], body[64];
+
+    sprintf(fil, "/v/%05d", i);
+    sprintf(save, "example.com/v/%05d.html", i);
+    sprintf(body, "volume entry %d", i);
+    store_entry(opt, &cache, "example.com", fil, save, 200, "OK", "text/html",
+                "utf-8", "", "", "", body, strlen(body));
+  }
+
+  /* compression: a few large bodies */
+  for (i = 0; i < large_count; i++) {
+    char fil[64], save[128];
+
+    sprintf(fil, "/big/%d.bin", i);
+    sprintf(save, "example.com/big/%d.bin", i);
+    store_entry(opt, &cache, "example.com", fil, save, 200, "OK",
+                "application/octet-stream", "", "", "", "", large_body[i],
+                large_size[i]);
+  }
+
+  selftest_close(&cache);
+
+  /* pass 2: read back and verify everything round-tripped */
+  selftest_open_for_read(&cache, opt);
+
+  failures += check_entry(opt, &cache, "example.com", "/", 200, "OK",
+                          "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
+                          "etag-normal", "", body_index, strlen(body_index));
+  failures += check_entry(opt, &cache, "example.com", "/moved", 301,
+                          "Moved Permanently", "text/html", "", "", "",
+                          location_long, NULL, 0);
+  failures +=
+      check_entry(opt, &cache, "example.com", "/api", 200, "OK",
+                  "application/json", "utf-8", "Tue, 02 Jan 2024 12:00:00 GMT",
+                  etag_long, "", body_api, strlen(body_api));
+  failures +=
+      check_entry(opt, &cache, "example.com", "/logo", 200, "OK", "image/png",
+                  "", "", "etag-bin", "", binary_body, sizeof(binary_body));
+  failures += check_entry(opt, &cache, "example.com", "/gone", 404, "Not Found",
+                          "text/html", "utf-8", "", "etag-404",
+                          "https://example.com/where-it-went", body_404,
+                          strlen(body_404));
+
+  for (i = 0; i < SELFTEST_VOLUME; i++) {
+    char fil[64], body[64];
+
+    sprintf(fil, "/v/%05d", i);
+    sprintf(body, "volume entry %d", i);
+    failures +=
+        check_entry(opt, &cache, "example.com", fil, 200, "OK", "text/html",
+                    "utf-8", "", "", "", body, strlen(body));
+  }
+
+  for (i = 0; i < large_count; i++) {
+    char fil[64];
+
+    sprintf(fil, "/big/%d.bin", i);
+    failures += check_entry(opt, &cache, "example.com", fil, 200, "OK",
+                            "application/octet-stream", "", "", "", "",
+                            large_body[i], large_size[i]);
+  }
+
+  selftest_close(&cache);
+
+  /* pass 3: update one edge entry with new body and headers */
+  selftest_open_for_write(&cache, opt);
+  store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
+              "OK", "text/html", "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT",
+              "etag-updated", "", body_updated, strlen(body_updated));
+  selftest_close(&cache);
+
+  /* pass 4: re-read and confirm the updated value, not the old one */
+  selftest_open_for_read(&cache, opt);
+  failures +=
+      check_entry(opt, &cache, "example.com", "/", 200, "OK", "text/html",
+                  "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT", "etag-updated",
+                  "", body_updated, strlen(body_updated));
+  selftest_close(&cache);
+
+  for (i = 0; i < large_count; i++) {
+    freet(large_body[i]);
+  }
+
+  return failures;
+}
diff --git a/src/htscache_selftest.h b/src/htscache_selftest.h
new file mode 100644
index 0000000..c120eb0
--- /dev/null
+++ b/src/htscache_selftest.h
@@ -0,0 +1,49 @@
+/* ------------------------------------------------------------ */
+/*
+HTTrack Website Copier, Offline Browser for Windows and Unix
+Copyright (C) 1998-2017 Xavier Roche and other contributors
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Important notes:
+
+- We hereby ask people using this source NOT to use it in purpose of grabbing
+emails addresses, or collecting any other private information on persons.
+This would disgrace our work, and spoil the many hours we spent on it.
+
+Please visit our Website: http://www.httrack.com
+*/
+
+/* ------------------------------------------------------------ */
+/* File: htscache_selftest.h */
+/* Author: Xavier Roche */
+/* ------------------------------------------------------------ */
+
+#ifndef HTSCACHE_SELFTEST_DEFH
+#define HTSCACHE_SELFTEST_DEFH
+
+#ifdef HTS_INTERNAL_BYTECODE
+
+#ifndef HTS_DEF_FWSTRUCT_httrackp
+#define HTS_DEF_FWSTRUCT_httrackp
+typedef struct httrackp httrackp;
+#endif
+
+/* Run the cache create/read/update self-test against a working directory.
+   Returns the number of failed checks (0 == success). */
+int cache_selftests(httrackp *opt, const char *dir);
+
+#endif
+
+#endif
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 3cf3e48..fe911aa 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -46,6 +46,7 @@ Please visit our Website: http://www.httrack.com
 #include "htszlib.h"
 #include "htscharset.h"
 #include "htsencoding.h"
+#include "htscache_selftest.h"
 #include "htsmd5.h"
 
 #include
@@ -2113,6 +2114,22 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
     case '#':{ // non documenté
       com++;
       switch (*com) {
+      case 'A': // cache self-test: httrack -#A <dir>
+        if (na + 1 < argc) {
+          const int err = cache_selftests(opt, argv[na + 1]);
+
+          printf("cache-selftest: %s\n", err ? "FAIL" : "OK");
+          htsmain_free();
+          // err is a mismatch count; collapse it to 0/1. If this value ends up
+          // as the process exit status (the test script relies on that), only
+          // its low 8 bits survive and a count of 256 would read as success.
+          return err ? 1 : 0;
+        } else {
+          fprintf(stderr, "Option #A requires a directory argument\n");
+          htsmain_free();
+          return 1;
+        }
+        break;
       case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
         {
           int hasFilter = 0;
diff --git a/tests/01_engine-cache.test b/tests/01_engine-cache.test
new file mode 100755
index 0000000..90d711f
--- /dev/null
+++ b/tests/01_engine-cache.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+
+# Cache create/read/update logic (driven by 'httrack -#A <dir>').
+#
+# The in-process self-test stores several hand-crafted edge entries (normal
+# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
+# all-in-cache, a binary body with embedded NUL/high bytes), a few thousand
+# small entries (index/lookup scale), and a few large compressible and
+# incompressible bodies (zlib deflate/inflate). It reads everything back
+# asserting every header field and the body round-trip byte for byte, then
+# updates one entry and confirms the new value is read back. Each mismatch is
+# reported on stderr and makes httrack exit non-zero.
+
+set -eu
+
+dir=$(mktemp -d)
+trap 'rm -rf "$dir"' EXIT
+
+# The working directory is a required trailing token. The exit status is
+# captured instead of left to 'set -e', so a failure reaches the message below.
+out=$(httrack -#A "$dir") && status=0 || status=$?
+
+# Match the exact success line, so the test cannot pass for an unrelated reason
+# (e.g. the -#A mode being gone and falling through to the usage screen, which
+# also exits non-zero but never prints this).
+test "$status" -eq 0 && test "$out" = "cache-selftest: OK" || {
+  echo "expected 'cache-selftest: OK', got: $out (exit status $status)" >&2
+  exit 1
+}
+
+# The self-test must have actually produced a ZIP cache on disk.
+test -e "$dir/hts-cache/new.zip" || {
+  echo "no ZIP cache was written by the self-test" >&2
+  exit 1
+}
+
+# Sanity-check the cache footprint: the few-thousand-entry pass is expected to
+# weigh ~1-2 MB. Fail if it balloons well past that (e.g. a per-entry overhead
+# regression or runaway growth). File bytes are summed: 'du -b' is GNU-only.
+ceiling=$((4 * 1024 * 1024))
+bytes=$(( $(find "$dir/hts-cache" -type f -exec cat {} + | wc -c) ))
+test "$bytes" -le "$ceiling" || {
+  echo "cache footprint $bytes bytes exceeds ${ceiling} ceiling" >&2
+  exit 1
+}
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 5c1dbb3..fc08fee 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -11,6 +11,7 @@ TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
 TEST_EXTENSIONS = .test
 TESTS = \
 	00_runnable.test \
+	01_engine-cache.test \
 	01_engine-charset.test \
 	01_engine-cmdline.test \
 	01_engine-entities.test \