mirror of
https://github.com/xroche/httrack.git
synced 2026-06-14 22:33:54 +03:00
Compare commits
10 Commits
cleanup/ht
...
tests/cach
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83ff148efd | ||
|
|
50bb02e729 | ||
|
|
b80ee793ac | ||
|
|
d12456c1e8 | ||
|
|
a52a2b146c | ||
|
|
226a38d3d0 | ||
|
|
1e463f65a5 | ||
|
|
09ed9968cd | ||
|
|
ad6915e3cc | ||
|
|
4a5580dec0 |
@@ -56,6 +56,7 @@ whttrackrundir = $(bindir)
|
||||
whttrackrun_SCRIPTS = webhttrack
|
||||
|
||||
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htscache_selftest.c \
|
||||
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
||||
htshelp.c htslib.c htscoremain.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
@@ -65,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscatchurl.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htscatchurl.h \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
|
||||
@@ -102,7 +102,8 @@ int cookie_add(t_cookie * cookie, const char *cook_name, const char *cook_value,
|
||||
strcatbuff(cook, "\n");
|
||||
if (!((strlen(cookie->data) + strlen(cook)) < cookie->max_len))
|
||||
return -1; // impossible d'ajouter
|
||||
cookie_insert(insert, cook);
|
||||
cookie_insert(insert, cookie->max_len - (size_t) (insert - cookie->data),
|
||||
cook);
|
||||
#if DEBUG_COOK
|
||||
printf("add_new cookie: name=\"%s\" value=\"%s\" domain=\"%s\" path=\"%s\"\n",
|
||||
cook_name, cook_value, domain, path);
|
||||
@@ -118,7 +119,7 @@ int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, con
|
||||
b = cookie_find(cookie->data, cook_name, domain, path);
|
||||
if (b) {
|
||||
a = cookie_nextfield(b);
|
||||
cookie_delete(b, a - b);
|
||||
cookie_delete(b, cookie->max_len - (size_t) (b - cookie->data), a - b);
|
||||
#if DEBUG_COOK
|
||||
printf("deleted old cookie: %s %s %s\n", cook_name, domain, path);
|
||||
#endif
|
||||
@@ -336,41 +337,44 @@ int cookie_save(t_cookie * cookie, const char *name) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// insertion chaine ins avant s
|
||||
void cookie_insert(char *s, const char *ins) {
|
||||
// Insert string ins before s. s_size is the capacity of the buffer at s.
|
||||
void cookie_insert(char *s, size_t s_size, const char *ins) {
|
||||
char *buff;
|
||||
|
||||
if (strnotempty(s) == 0) { // rien à faire, juste concat
|
||||
strcatbuff(s, ins);
|
||||
if (strnotempty(s) == 0) { // nothing there yet: just concatenate
|
||||
strlcatbuff(s, ins, s_size);
|
||||
} else {
|
||||
buff = (char *) malloct(strlen(s) + 1);
|
||||
if (buff) {
|
||||
strcpybuff(buff, s); // copie temporaire
|
||||
strcpybuff(s, ins); // insérer
|
||||
strcatbuff(s, buff); // copier
|
||||
strlcpybuff(buff, s, strlen(s) + 1); // temporary copy of s
|
||||
strlcpybuff(s, ins, s_size); // write ins
|
||||
strlcatbuff(s, buff, s_size); // then the saved content
|
||||
freet(buff);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// destruction chaine dans s position pos
|
||||
void cookie_delete(char *s, size_t pos) {
|
||||
// Delete the substring of s at position pos. s_size is the capacity at s.
|
||||
void cookie_delete(char *s, size_t s_size, size_t pos) {
|
||||
char *buff;
|
||||
|
||||
if (strnotempty(s + pos) == 0) { // rien à faire, effacer
|
||||
if (strnotempty(s + pos) == 0) { // nothing after pos: truncate
|
||||
s[0] = '\0';
|
||||
} else {
|
||||
buff = (char *) malloct(strlen(s + pos) + 1);
|
||||
if (buff) {
|
||||
strcpybuff(buff, s + pos); // copie temporaire
|
||||
strcpybuff(s, buff); // copier
|
||||
strlcpybuff(buff, s + pos, strlen(s + pos) + 1); // temporary copy
|
||||
strlcpybuff(s, buff, s_size); // overwrite from start
|
||||
freet(buff);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// renvoie champ param de la chaine cookie_base
|
||||
// ex: cookie_get("ceci est<tab>un<tab>exemple",1) renvoi "un"
|
||||
// Return field <param> (0-based, tab-separated) of the cookie line cookie_base,
|
||||
// into buffer. ex: cookie_get("ceci est<tab>un<tab>exemple", 1) returns "un".
|
||||
// buffer must hold at least COOKIE_FIELD_BUFFER_SIZE bytes (all callers use
|
||||
// char[8192]).
|
||||
#define COOKIE_FIELD_BUFFER_SIZE 8192
|
||||
const char *cookie_get(char *buffer, const char *cookie_base, int param) {
|
||||
const char *limit;
|
||||
|
||||
@@ -394,11 +398,11 @@ const char *cookie_get(char *buffer, const char *cookie_base, int param) {
|
||||
if (cookie_base) {
|
||||
if (cookie_base < limit) {
|
||||
const char *a = cookie_base;
|
||||
htsbuff b = htsbuff_ptr(buffer, COOKIE_FIELD_BUFFER_SIZE);
|
||||
|
||||
while((*a) && (*a != '\t') && (*a != '\n'))
|
||||
a++;
|
||||
buffer[0] = '\0';
|
||||
strncatbuff(buffer, cookie_base, (int) (a - cookie_base));
|
||||
htsbuff_catn(&b, cookie_base, (size_t) (a - cookie_base));
|
||||
return buffer;
|
||||
} else
|
||||
return "";
|
||||
@@ -458,11 +462,13 @@ char *bauth_check(t_cookie * cookie, const char *adr, const char *fil) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Build the auth prefix (host + path, query stripped) into prefix.
|
||||
Callers pass a buffer of HTS_URLMAXSIZE * 2 bytes. */
|
||||
char *bauth_prefix(char *prefix, const char *adr, const char *fil) {
|
||||
char *a;
|
||||
|
||||
strcpybuff(prefix, jump_identification_const(adr));
|
||||
strcatbuff(prefix, fil);
|
||||
strlcpybuff(prefix, jump_identification_const(adr), HTS_URLMAXSIZE * 2);
|
||||
strlcatbuff(prefix, fil, HTS_URLMAXSIZE * 2);
|
||||
a = strchr(prefix, '?');
|
||||
if (a)
|
||||
*a = '\0';
|
||||
|
||||
@@ -67,8 +67,8 @@ int cookie_add(t_cookie * cookie, const char *cook_name, const char *cook_valu
|
||||
int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, const char *path);
|
||||
int cookie_load(t_cookie * cookie, const char *path, const char *name);
|
||||
int cookie_save(t_cookie * cookie, const char *name);
|
||||
void cookie_insert(char *s, const char *ins);
|
||||
void cookie_delete(char *s, size_t pos);
|
||||
void cookie_insert(char *s, size_t s_size, const char *ins);
|
||||
void cookie_delete(char *s, size_t s_size, size_t pos);
|
||||
const char *cookie_get(char *buffer, const char *cookie_base, int param);
|
||||
char *cookie_find(char *s, const char *cook_name, const char *domain, const char *path);
|
||||
char *cookie_nextfield(char *a);
|
||||
|
||||
140
src/htscache.c
140
src/htscache.c
@@ -196,12 +196,13 @@ struct cache_back_zip_entry {
|
||||
int compressionMethod;
|
||||
};
|
||||
|
||||
#define ZIP_READFIELD_STRING(line, value, refline, refvalue) do { \
|
||||
if (line[0] != '\0' && strfield2(line, refline)) { \
|
||||
strcpybuff(refvalue, value); \
|
||||
line[0] = '\0'; \
|
||||
} \
|
||||
} while(0)
|
||||
#define ZIP_READFIELD_STRING(line, value, refline, refvalue, refvalue_size) \
|
||||
do { \
|
||||
if (line[0] != '\0' && strfield2(line, refline)) { \
|
||||
strlcpybuff(refvalue, value, refvalue_size); \
|
||||
line[0] = '\0'; \
|
||||
} \
|
||||
} while (0)
|
||||
#define ZIP_READFIELD_INT(line, value, refline, refvalue) do { \
|
||||
if (line[0] != '\0' && strfield2(line, refline)) { \
|
||||
int intval = 0; \
|
||||
@@ -643,7 +644,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
} else {
|
||||
r.location = location_default;
|
||||
}
|
||||
strcpybuff(r.location, "");
|
||||
r.location[0] = '\0';
|
||||
strcpybuff(buff, adr);
|
||||
strcatbuff(buff, fil);
|
||||
hash_pos_return = coucal_read(cache->hashtable, buff, &hash_pos);
|
||||
@@ -706,17 +707,25 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
value++;
|
||||
ZIP_READFIELD_INT(line, value, "X-In-Cache", dataincache);
|
||||
ZIP_READFIELD_INT(line, value, "X-Statuscode", r.statuscode);
|
||||
ZIP_READFIELD_STRING(line, value, "X-StatusMessage", r.msg); // msg
|
||||
ZIP_READFIELD_STRING(line, value, "X-StatusMessage", r.msg,
|
||||
sizeof(r.msg));
|
||||
ZIP_READFIELD_LLINT(line, value, "X-Size", r.size); // size
|
||||
ZIP_READFIELD_STRING(line, value, "Content-Type", r.contenttype); // contenttype
|
||||
ZIP_READFIELD_STRING(line, value, "X-Charset", r.charset); // contenttype
|
||||
ZIP_READFIELD_STRING(line, value, "Last-Modified", r.lastmodified); // last-modified
|
||||
ZIP_READFIELD_STRING(line, value, "Etag", r.etag); // Etag
|
||||
ZIP_READFIELD_STRING(line, value, "Location", r.location); // 'location' pour moved
|
||||
ZIP_READFIELD_STRING(line, value, "Content-Disposition", r.cdispo); // Content-disposition
|
||||
ZIP_READFIELD_STRING(line, value, "Content-Type", r.contenttype,
|
||||
sizeof(r.contenttype));
|
||||
ZIP_READFIELD_STRING(line, value, "X-Charset", r.charset,
|
||||
sizeof(r.charset));
|
||||
ZIP_READFIELD_STRING(line, value, "Last-Modified", r.lastmodified,
|
||||
sizeof(r.lastmodified));
|
||||
ZIP_READFIELD_STRING(line, value, "Etag", r.etag, sizeof(r.etag));
|
||||
// r.location is a char* pointing into a HTS_URLMAXSIZE*2 buffer
|
||||
ZIP_READFIELD_STRING(line, value, "Location", r.location,
|
||||
HTS_URLMAXSIZE * 2);
|
||||
ZIP_READFIELD_STRING(line, value, "Content-Disposition", r.cdispo,
|
||||
sizeof(r.cdispo));
|
||||
//ZIP_READFIELD_STRING(line, value, "X-Addr", ..); // Original address
|
||||
//ZIP_READFIELD_STRING(line, value, "X-Fil", ..); // Original URI filename
|
||||
ZIP_READFIELD_STRING(line, value, "X-Save", previous_save_); // Original save filename
|
||||
ZIP_READFIELD_STRING(line, value, "X-Save", previous_save_,
|
||||
sizeof(previous_save_));
|
||||
}
|
||||
} while(offset < readSizeHeader && !lineEof);
|
||||
//totalHeader = offset;
|
||||
@@ -733,7 +742,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
}
|
||||
}
|
||||
if (return_save != NULL) {
|
||||
strcpybuff(return_save, previous_save);
|
||||
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
|
||||
}
|
||||
|
||||
/* Complete fields */
|
||||
@@ -1025,7 +1034,7 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
|
||||
} else {
|
||||
r.location = location_default;
|
||||
}
|
||||
strcpybuff(r.location, "");
|
||||
r.location[0] = '\0';
|
||||
#if HTS_FAST_CACHE
|
||||
strcpybuff(buff, adr);
|
||||
strcatbuff(buff, fil);
|
||||
@@ -1096,30 +1105,34 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
|
||||
//
|
||||
cache_rint(cache->olddat, &r.statuscode);
|
||||
cache_rLLint(cache->olddat, &r.size);
|
||||
cache_rstr(cache->olddat, r.msg);
|
||||
cache_rstr(cache->olddat, r.contenttype);
|
||||
cache_rstr(cache->olddat, r.msg, sizeof(r.msg));
|
||||
cache_rstr(cache->olddat, r.contenttype, sizeof(r.contenttype));
|
||||
if (cache->version >= 3)
|
||||
cache_rstr(cache->olddat, r.charset);
|
||||
cache_rstr(cache->olddat, r.lastmodified);
|
||||
cache_rstr(cache->olddat, r.etag);
|
||||
cache_rstr(cache->olddat, r.location);
|
||||
cache_rstr(cache->olddat, r.charset, sizeof(r.charset));
|
||||
cache_rstr(cache->olddat, r.lastmodified, sizeof(r.lastmodified));
|
||||
cache_rstr(cache->olddat, r.etag, sizeof(r.etag));
|
||||
// r.location points into a HTS_URLMAXSIZE*2 buffer
|
||||
cache_rstr(cache->olddat, r.location, HTS_URLMAXSIZE * 2);
|
||||
if (cache->version >= 2)
|
||||
cache_rstr(cache->olddat, r.cdispo);
|
||||
cache_rstr(cache->olddat, r.cdispo, sizeof(r.cdispo));
|
||||
if (cache->version >= 4) {
|
||||
cache_rstr(cache->olddat, previous_save); // adr
|
||||
cache_rstr(cache->olddat, previous_save); // fil
|
||||
cache_rstr(cache->olddat, previous_save,
|
||||
sizeof(previous_save)); // adr
|
||||
cache_rstr(cache->olddat, previous_save,
|
||||
sizeof(previous_save)); // fil
|
||||
previous_save[0] = '\0';
|
||||
cache_rstr(cache->olddat, previous_save); // save
|
||||
cache_rstr(cache->olddat, previous_save,
|
||||
sizeof(previous_save)); // save
|
||||
if (return_save != NULL) {
|
||||
strcpybuff(return_save, previous_save);
|
||||
strlcpybuff(return_save, previous_save, HTS_URLMAXSIZE * 2);
|
||||
}
|
||||
}
|
||||
if (cache->version >= 5) {
|
||||
r.headers = cache_rstr_addr(cache->olddat);
|
||||
}
|
||||
//
|
||||
cache_rstr(cache->olddat, check);
|
||||
if (strcmp(check, "HTS") == 0) { /* intégrité OK */
|
||||
cache_rstr(cache->olddat, check, sizeof(check));
|
||||
if (strcmp(check, "HTS") == 0) { /* integrity OK */
|
||||
ok = 1;
|
||||
}
|
||||
cache_rLLint(cache->olddat, &size_read); /* lire size pour être sûr de la taille déclarée (réécrire) */
|
||||
@@ -1758,12 +1771,12 @@ void cache_init(cache_back * cache, httrackp * opt) {
|
||||
char firstline[256];
|
||||
char *a = cache->use;
|
||||
|
||||
a += cache_brstr(a, firstline);
|
||||
if (strncmp(firstline, "CACHE-", 6) == 0) { // Nouvelle version du cache
|
||||
if (strncmp(firstline, "CACHE-1.", 8) == 0) { // Version 1.1x
|
||||
a += cache_brstr(a, firstline, sizeof(firstline));
|
||||
if (strncmp(firstline, "CACHE-", 6) == 0) { // new cache format
|
||||
if (strncmp(firstline, "CACHE-1.", 8) == 0) { // version 1.1x
|
||||
cache->version = (int) (firstline[8] - '0'); // cache 1.x
|
||||
if (cache->version <= 5) {
|
||||
a += cache_brstr(a, firstline);
|
||||
a += cache_brstr(a, firstline, sizeof(firstline));
|
||||
strcpybuff(cache->lastmodified, firstline);
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
@@ -1774,7 +1787,7 @@ void cache_init(cache_back * cache, httrackp * opt) {
|
||||
freet(cache->use);
|
||||
cache->use = NULL;
|
||||
}
|
||||
} else { // non supporté
|
||||
} else { // non supporté
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Cache: %s not supported, ignoring current cache",
|
||||
firstline);
|
||||
@@ -1784,7 +1797,7 @@ void cache_init(cache_back * cache, httrackp * opt) {
|
||||
cache->use = NULL;
|
||||
}
|
||||
/* */
|
||||
} else { // Vieille version du cache
|
||||
} else { // Vieille version du cache
|
||||
/* */
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Cache: importing old cache format");
|
||||
@@ -2088,7 +2101,7 @@ char *readfile_or(const char *fil, const char *defaultdata) {
|
||||
char *adr = malloct(strlen(defaultdata) + 1);
|
||||
|
||||
if (adr) {
|
||||
strcpybuff(adr, defaultdata);
|
||||
strlcpybuff(adr, defaultdata, strlen(defaultdata) + 1);
|
||||
return adr;
|
||||
}
|
||||
}
|
||||
@@ -2109,7 +2122,7 @@ int cache_wstr(FILE * fp, const char *s) {
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
void cache_rstr(FILE * fp, char *s) {
|
||||
void cache_rstr(FILE *fp, char *s, size_t s_size) {
|
||||
INTsys i;
|
||||
char buff[256 + 4];
|
||||
|
||||
@@ -2118,13 +2131,26 @@ void cache_rstr(FILE * fp, char *s) {
|
||||
if (i < 0 || i > 32768) /* error, something nasty happened */
|
||||
i = 0;
|
||||
if (i > 0) {
|
||||
if ((int) fread(s, 1, i, fp) != i) {
|
||||
/* Store at most s_size-1 bytes into s, but consume all i bytes from the
|
||||
stream so the next field stays aligned (the field may be longer than the
|
||||
destination in a tampered/old cache). */
|
||||
const size_t want = (size_t) i;
|
||||
const size_t store = want < s_size ? want : s_size - 1;
|
||||
|
||||
if (fread(s, 1, store, fp) != store) {
|
||||
int fread_cache_failed = 0;
|
||||
|
||||
assertf(fread_cache_failed);
|
||||
}
|
||||
if (want > store && fseek(fp, (long) (want - store), SEEK_CUR) != 0) {
|
||||
int fseek_cache_failed = 0;
|
||||
|
||||
assertf(fseek_cache_failed);
|
||||
}
|
||||
s[store] = '\0';
|
||||
} else {
|
||||
s[0] = '\0';
|
||||
}
|
||||
*(s + i) = '\0';
|
||||
}
|
||||
char *cache_rstr_addr(FILE * fp) {
|
||||
INTsys i;
|
||||
@@ -2148,7 +2174,7 @@ char *cache_rstr_addr(FILE * fp) {
|
||||
}
|
||||
return addr;
|
||||
}
|
||||
int cache_brstr(char *adr, char *s) {
|
||||
int cache_brstr(char *adr, char *s, size_t s_size) {
|
||||
int i;
|
||||
int off;
|
||||
char buff[256 + 4];
|
||||
@@ -2156,23 +2182,17 @@ int cache_brstr(char *adr, char *s) {
|
||||
off = binput(adr, buff, 256);
|
||||
adr += off;
|
||||
sscanf(buff, "%d", &i);
|
||||
if (i > 0)
|
||||
strncpy(s, adr, i);
|
||||
*(s + i) = '\0';
|
||||
off += i;
|
||||
return off;
|
||||
}
|
||||
int cache_quickbrstr(char *adr, char *s) {
|
||||
int i;
|
||||
int off;
|
||||
char buff[256 + 4];
|
||||
if (i < 0 || i > 32768) /* guard a corrupt length */
|
||||
i = 0;
|
||||
if (i > 0) {
|
||||
/* copy at most s_size-1 bytes; advance past the full field regardless */
|
||||
const size_t store = (size_t) i < s_size ? (size_t) i : s_size - 1;
|
||||
|
||||
off = binput(adr, buff, 256);
|
||||
adr += off;
|
||||
sscanf(buff, "%d", &i);
|
||||
if (i > 0)
|
||||
strncpy(s, adr, i);
|
||||
*(s + i) = '\0';
|
||||
strncpy(s, adr, store);
|
||||
s[store] = '\0';
|
||||
} else {
|
||||
s[0] = '\0';
|
||||
}
|
||||
off += i;
|
||||
return off;
|
||||
}
|
||||
@@ -2180,7 +2200,7 @@ int cache_quickbrstr(char *adr, char *s) {
|
||||
/* idem, mais en int */
|
||||
int cache_brint(char *adr, int *i) {
|
||||
char s[256];
|
||||
int r = cache_brstr(adr, s);
|
||||
int r = cache_brstr(adr, s, sizeof(s));
|
||||
|
||||
if (r != -1)
|
||||
sscanf(s, "%d", i);
|
||||
@@ -2189,7 +2209,7 @@ int cache_brint(char *adr, int *i) {
|
||||
void cache_rint(FILE * fp, int *i) {
|
||||
char s[256];
|
||||
|
||||
cache_rstr(fp, s);
|
||||
cache_rstr(fp, s, sizeof(s));
|
||||
sscanf(s, "%d", i);
|
||||
}
|
||||
int cache_wint(FILE * fp, int i) {
|
||||
@@ -2201,7 +2221,7 @@ int cache_wint(FILE * fp, int i) {
|
||||
void cache_rLLint(FILE * fp, LLint * i) {
|
||||
char s[256];
|
||||
|
||||
cache_rstr(fp, s);
|
||||
cache_rstr(fp, s, sizeof(s));
|
||||
sscanf(s, LLintP, i);
|
||||
}
|
||||
int cache_wLLint(FILE * fp, LLint i) {
|
||||
|
||||
@@ -80,10 +80,9 @@ int cache_writedata(FILE * cache_ndx, FILE * cache_dat, const char *str1,
|
||||
int cache_readdata(cache_back * cache, const char *str1, const char *str2,
|
||||
char **inbuff, int *len);
|
||||
|
||||
void cache_rstr(FILE * fp, char *s);
|
||||
void cache_rstr(FILE *fp, char *s, size_t s_size);
|
||||
char *cache_rstr_addr(FILE * fp);
|
||||
int cache_brstr(char *adr, char *s);
|
||||
int cache_quickbrstr(char *adr, char *s);
|
||||
int cache_brstr(char *adr, char *s, size_t s_size);
|
||||
int cache_brint(char *adr, int *i);
|
||||
void cache_rint(FILE * fp, int *i);
|
||||
void cache_rLLint(FILE * fp, LLint * i);
|
||||
|
||||
374
src/htscache_selftest.c
Normal file
374
src/htscache_selftest.c
Normal file
@@ -0,0 +1,374 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htscache_selftest.c subroutines: */
|
||||
/* in-process self-test for the (ZIP) cache subsystem */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/* Drives the public cache API (cache_init / cache_add / cache_readex)
|
||||
through a create -> read -> update cycle on a real on-disk ZIP cache,
|
||||
asserting every header field and the (binary-safe) body round-trips.
|
||||
Besides a few hand-crafted edge cases it stores a few thousand entries
|
||||
(index/lookup scale) and a handful of large compressible/incompressible
|
||||
bodies (zlib deflate/inflate). Reached via `httrack -#A <dir>`. */
|
||||
|
||||
#define HTS_INTERNAL_BYTECODE
|
||||
|
||||
#include "htscache_selftest.h"
|
||||
|
||||
#include "htscache.h"
|
||||
#include "htscore.h"
|
||||
#include "htslib.h"
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#define SELFTEST_VOLUME 3000 /* number of small entries in the scale pass */
|
||||
|
||||
/* Open a cache session. A write session (ro=0) rotates new.zip -> old.zip and
|
||||
opens a fresh new.zip; a read session (ro=1) opens new.zip in place. */
|
||||
static void selftest_open(cache_back *cache, httrackp *opt, int ro) {
|
||||
memset(cache, 0, sizeof(*cache));
|
||||
cache->type = 1;
|
||||
cache->log = stderr;
|
||||
cache->errlog = stderr;
|
||||
cache->hashtable = coucal_new(0);
|
||||
cache->ro = ro;
|
||||
cache_init(cache, opt);
|
||||
}
|
||||
|
||||
/* Open a write session: selftest_open() with ro=0. */
static void selftest_open_for_write(cache_back *cache, httrackp *opt) {
  const int read_only = 0;

  selftest_open(cache, opt, read_only);
}
|
||||
|
||||
/* Open a read session: selftest_open() with ro=1. */
static void selftest_open_for_read(cache_back *cache, httrackp *opt) {
  const int read_only = 1;

  selftest_open(cache, opt, read_only);
}
|
||||
|
||||
/* Close *fp when it is open, then reset it to NULL. */
static void selftest_fclose(FILE **fp) {
  if (*fp != NULL) {
    fclose(*fp);
    *fp = NULL;
  }
}

/* Release every handle a cache session may hold: the dat and ndx streams,
   then the ZIP output, then the ZIP input. Each handle is reset to NULL once
   closed, and handles that are already NULL are left alone. */
static void selftest_close(cache_back *cache) {
  selftest_fclose(&cache->dat);
  selftest_fclose(&cache->ndx);

  if (cache->zipOutput) {
    zipClose(cache->zipOutput,
             "Created by HTTrack Website Copier (cache self-test)");
    cache->zipOutput = NULL;
  }

  if (cache->zipInput) {
    unzClose(cache->zipInput);
    cache->zipInput = NULL;
  }

  /* The hashtable is deliberately left alone (no coucal_delete()): deleting
     it would print a stats summary on stderr at every call, and this is a
     one-shot CLI subcommand that exits right afterwards -- the same choice
     as the other -# cache subcommands. */
}
|
||||
|
||||
/* Store one entry. The body is copied into a private buffer (any size), so
|
||||
callers may pass const data and cache_add never sees a cast-away qualifier;
|
||||
it consumes everything synchronously, so the copy is freed on return. */
|
||||
static void store_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
const char *fil, const char *save, int statuscode,
|
||||
const char *msg, const char *contenttype,
|
||||
const char *charset, const char *lastmodified,
|
||||
const char *etag, const char *location,
|
||||
const char *body, size_t body_len) {
|
||||
htsblk r;
|
||||
char locbuf[HTS_URLMAXSIZE * 2];
|
||||
char *bodycopy = NULL;
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = statuscode;
|
||||
r.size = (LLint) body_len;
|
||||
strcpybuff(r.msg, msg);
|
||||
strcpybuff(r.contenttype, contenttype);
|
||||
strcpybuff(r.charset, charset);
|
||||
strcpybuff(r.lastmodified, lastmodified);
|
||||
strcpybuff(r.etag, etag);
|
||||
strcpybuff(locbuf, location);
|
||||
r.location = locbuf;
|
||||
r.is_write = 0;
|
||||
/* an empty body must be a NULL pointer: cache_add rejects a non-NULL
|
||||
pointer with size 0 */
|
||||
if (body_len != 0) {
|
||||
bodycopy = malloct(body_len);
|
||||
memcpy(bodycopy, body, body_len);
|
||||
r.adr = bodycopy;
|
||||
}
|
||||
/* all_in_cache=1: keep the body in the ZIP whatever the content-type,
|
||||
so the read path never depends on a file on disk */
|
||||
cache_add(opt, cache, &r, adr, fil, save, 1, NULL);
|
||||
if (bodycopy != NULL) {
|
||||
freet(bodycopy);
|
||||
}
|
||||
}
|
||||
|
||||
/* Read one entry back and check every field. Returns the number of
|
||||
mismatches (0 == success). */
|
||||
static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
const char *fil, int statuscode, const char *msg,
|
||||
const char *contenttype, const char *charset,
|
||||
const char *lastmodified, const char *etag,
|
||||
const char *location, const char *body,
|
||||
size_t body_len) {
|
||||
int fail = 0;
|
||||
char *locbuf = malloct(HTS_URLMAXSIZE * 2);
|
||||
htsblk r;
|
||||
|
||||
locbuf[0] = '\0';
|
||||
/* readonly=1: pure read, no rename/disk-write decision logic */
|
||||
r = cache_readex(opt, cache, adr, fil, "", locbuf, NULL, 1);
|
||||
|
||||
#define CHECK_STR(field, want) \
|
||||
do { \
|
||||
if (strcmp((field), (want)) != 0) { \
|
||||
fprintf(stderr, \
|
||||
"cache-selftest: %s%s: " #field " is '%s', expected '%s'\n", \
|
||||
adr, fil, (field), (want)); \
|
||||
fail++; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
if (r.statuscode != statuscode) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: statuscode is %d, expected %d\n",
|
||||
adr, fil, r.statuscode, statuscode);
|
||||
fail++;
|
||||
}
|
||||
CHECK_STR(r.msg, msg);
|
||||
CHECK_STR(r.contenttype, contenttype);
|
||||
CHECK_STR(r.charset, charset);
|
||||
CHECK_STR(r.lastmodified, lastmodified);
|
||||
CHECK_STR(r.etag, etag);
|
||||
CHECK_STR(locbuf, location);
|
||||
|
||||
if (r.size != (LLint) body_len) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: size is " LLintP ", expected %d\n",
|
||||
adr, fil, (LLint) r.size, (int) body_len);
|
||||
fail++;
|
||||
} else if (body_len != 0 &&
|
||||
(r.adr == NULL || memcmp(r.adr, body, body_len) != 0)) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: body mismatch\n", adr, fil);
|
||||
fail++;
|
||||
}
|
||||
|
||||
#undef CHECK_STR
|
||||
|
||||
if (r.adr != NULL) {
|
||||
freet(r.adr);
|
||||
}
|
||||
freet(locbuf);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* Write len deterministic bytes into buf.
   kind 0: the repeating pattern 'A'..'Z' (highly compressible).
   kind 1 (and any other value): the output of a linear congruential
           generator seeded from len (incompressible).
   kind 2: pattern bytes at even offsets, generator bytes at odd offsets.
   The generator only advances when one of its bytes is emitted. */
static void gen_body(char *buf, size_t len, int kind) {
  unsigned int state = 0x9e3779b1u ^ (unsigned int) len;
  size_t pos;

  for (pos = 0; pos != len; ++pos) {
    const int use_pattern = (kind == 0) || (kind == 2 && pos % 2 == 0);

    if (!use_pattern) {
      state = state * 1103515245u + 12345u;
      buf[pos] = (char) (state >> 16);
      continue;
    }
    buf[pos] = (char) ('A' + (pos % 26));
  }
}
|
||||
|
||||
/* Run the cache self-test in directory dir.

   Pass 1 writes a set of hand-crafted edge-case entries, SELFTEST_VOLUME
   small entries and a few large bodies in a single write session. Pass 2
   reads everything back and checks every field. Pass 3 rewrites one entry
   with new headers and body, and pass 4 confirms the updated values are read.

   Returns the number of mismatches found (0 == success). If a large test
   body cannot be allocated, a message is printed on stderr and 1 is returned
   before any cache file is touched. */
int cache_selftests(httrackp *opt, const char *dir) {
  int failures = 0;
  cache_back cache;
  int i;

  /* near-limit field values. The etag stresses htsblk.etag[256]; the location
     stresses a long redirect URL. Each cached header line is read back through
     a HTS_URLMAXSIZE-sized parse buffer ("<field>: <value>\r\n"), so the
     round-trippable value is shorter than HTS_URLMAXSIZE: 1000 stays safely
     under that real limit. */
  static char etag_long[251];
  static char location_long[1001];

  /* a body with embedded NUL and high bytes, to prove binary safety */
  static const char binary_body[] = {
      'P',  'N',  'G', '\0', '\r', '\n',        (char) 0xFF, (char) 0x80,
      '\0', '\0', 'e', 'n',  'd',  (char) 0xCA, (char) 0xFE, '\n'};

  /* large bodies for the compression pass; kept alive across the write and
     read passes so the read can compare against them. large_body is
     dimensioned from large_size so the two can never disagree. */
  static const size_t large_size[] = {200000, 200000, 50000};
  const int large_count = (int) (sizeof(large_size) / sizeof(large_size[0]));
  char *large_body[sizeof(large_size) / sizeof(large_size[0])];

  /* edge-case bodies, named so store and read assert the exact same bytes */
  const char *const body_index = "<html><body>hello</body></html>";
  const char *const body_api = "{\"k\":\"v\"}";
  const char *const body_updated = "<html><body>UPDATED CONTENT</body></html>";
  const char *const body_404 = "<html><body>404 Not Found</body></html>";

  memset(etag_long, 'E', sizeof(etag_long) - 1);
  etag_long[sizeof(etag_long) - 1] = '\0';
  memset(location_long, 'L', sizeof(location_long) - 1);
  location_long[sizeof(location_long) - 1] = '\0';

  for (i = 0; i < large_count; i++) {
    large_body[i] = malloct(large_size[i]);
    if (large_body[i] == NULL) {
      /* gen_body() must never be handed a NULL buffer: release what was
         already allocated and report one failure */
      fprintf(stderr, "cache-selftest: out of memory (%lu bytes)\n",
              (unsigned long) large_size[i]);
      while (i-- > 0) {
        freet(large_body[i]);
      }
      return 1;
    }
    /* the index doubles as the body kind: 0 compressible, 1 incompressible,
       2 alternating */
    gen_body(large_body[i], large_size[i], i);
  }

  /* set up an isolated cache directory */
  {
    char base[HTS_URLMAXSIZE];

    strcpybuff(base, dir);
    /* make sure the directory ends with a slash */
    if (base[0] != '\0' && base[strlen(base) - 1] != '/') {
      strcatbuff(base, "/");
    }
    StringCopy(opt->path_log, base);
  }
  opt->cache = 1;

  /* pass 1: create everything in a single write session */
  selftest_open_for_write(&cache, opt);

  /* edge cases: normal HTML page */
  store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
              "OK", "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
              "etag-normal", "", body_index, strlen(body_index));
  /* redirect: empty body, empty optional fields, near-limit location */
  store_entry(opt, &cache, "example.com", "/moved", "example.com/moved.html",
              301, "Moved Permanently", "text/html", "", "", "", location_long,
              NULL, 0);
  /* non-HTML content-type kept in cache via all_in_cache, near-limit etag */
  store_entry(opt, &cache, "example.com", "/api", "example.com/api.json", 200,
              "OK", "application/json", "utf-8",
              "Tue, 02 Jan 2024 12:00:00 GMT", etag_long, "", body_api,
              strlen(body_api));
  /* binary body */
  store_entry(opt, &cache, "example.com", "/logo", "example.com/logo.png", 200,
              "OK", "image/png", "", "", "etag-bin", "", binary_body,
              sizeof(binary_body));
  /* error status with a body and a location (non-2xx codes are cached too) */
  store_entry(opt, &cache, "example.com", "/gone", "example.com/gone.html", 404,
              "Not Found", "text/html", "utf-8", "", "etag-404",
              "https://example.com/where-it-went", body_404, strlen(body_404));

  /* scale: a few thousand small entries */
  for (i = 0; i < SELFTEST_VOLUME; i++) {
    char fil[64], save[128], body[64];

    sprintf(fil, "/v/%05d", i);
    sprintf(save, "example.com/v/%05d.html", i);
    sprintf(body, "<html>volume entry %d</html>", i);
    store_entry(opt, &cache, "example.com", fil, save, 200, "OK", "text/html",
                "utf-8", "", "", "", body, strlen(body));
  }

  /* compression: a few large bodies */
  for (i = 0; i < large_count; i++) {
    char fil[64], save[128];

    sprintf(fil, "/big/%d.bin", i);
    sprintf(save, "example.com/big/%d.bin", i);
    store_entry(opt, &cache, "example.com", fil, save, 200, "OK",
                "application/octet-stream", "", "", "", "", large_body[i],
                large_size[i]);
  }

  selftest_close(&cache);

  /* pass 2: read back and verify everything round-tripped */
  selftest_open_for_read(&cache, opt);

  failures += check_entry(opt, &cache, "example.com", "/", 200, "OK",
                          "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
                          "etag-normal", "", body_index, strlen(body_index));
  failures += check_entry(opt, &cache, "example.com", "/moved", 301,
                          "Moved Permanently", "text/html", "", "", "",
                          location_long, NULL, 0);
  failures +=
      check_entry(opt, &cache, "example.com", "/api", 200, "OK",
                  "application/json", "utf-8", "Tue, 02 Jan 2024 12:00:00 GMT",
                  etag_long, "", body_api, strlen(body_api));
  failures +=
      check_entry(opt, &cache, "example.com", "/logo", 200, "OK", "image/png",
                  "", "", "etag-bin", "", binary_body, sizeof(binary_body));
  failures += check_entry(opt, &cache, "example.com", "/gone", 404, "Not Found",
                          "text/html", "utf-8", "", "etag-404",
                          "https://example.com/where-it-went", body_404,
                          strlen(body_404));

  for (i = 0; i < SELFTEST_VOLUME; i++) {
    char fil[64], body[64];

    sprintf(fil, "/v/%05d", i);
    sprintf(body, "<html>volume entry %d</html>", i);
    failures +=
        check_entry(opt, &cache, "example.com", fil, 200, "OK", "text/html",
                    "utf-8", "", "", "", body, strlen(body));
  }

  for (i = 0; i < large_count; i++) {
    char fil[64];

    sprintf(fil, "/big/%d.bin", i);
    failures += check_entry(opt, &cache, "example.com", fil, 200, "OK",
                            "application/octet-stream", "", "", "", "",
                            large_body[i], large_size[i]);
  }

  selftest_close(&cache);

  /* pass 3: update one edge entry with new body and headers */
  selftest_open_for_write(&cache, opt);
  store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
              "OK", "text/html", "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT",
              "etag-updated", "", body_updated, strlen(body_updated));
  selftest_close(&cache);

  /* pass 4: re-read and confirm the updated value, not the old one */
  selftest_open_for_read(&cache, opt);
  failures +=
      check_entry(opt, &cache, "example.com", "/", 200, "OK", "text/html",
                  "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT", "etag-updated",
                  "", body_updated, strlen(body_updated));
  selftest_close(&cache);

  for (i = 0; i < large_count; i++) {
    freet(large_body[i]);
  }

  return failures;
}
|
||||
49
src/htscache_selftest.h
Normal file
49
src/htscache_selftest.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htscache_selftest.h */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSCACHE_SELFTEST_DEFH
#define HTSCACHE_SELFTEST_DEFH

#ifdef HTS_INTERNAL_BYTECODE

/* Forward declaration of the engine option structure (shared guard, so the
   typedef is emitted only once across headers). */
#ifndef HTS_DEF_FWSTRUCT_httrackp
#define HTS_DEF_FWSTRUCT_httrackp
typedef struct httrackp httrackp;
#endif /* HTS_DEF_FWSTRUCT_httrackp */

/* Cache self-test entry point (driven by 'httrack -#A <dir>').

   Stores a set of entries in a cache, reads them back and checks them,
   then updates one entry and re-reads it to confirm the new value.

   opt: engine options, forwarded to the cache store/check helpers.
   dir: working directory argument for the test run.

   Returns the number of failed checks; 0 means every check passed. */
int cache_selftests(httrackp *opt, const char *dir);

#endif /* HTS_INTERNAL_BYTECODE */

#endif /* HTSCACHE_SELFTEST_DEFH */
|
||||
@@ -40,11 +40,13 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htsalias.h"
|
||||
#include "htsbauth.h"
|
||||
#include "htswrap.h"
|
||||
#include "htsmodules.h"
|
||||
#include "htszlib.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsmd5.h"
|
||||
|
||||
#include <ctype.h>
|
||||
@@ -138,6 +140,19 @@ static void basic_selftests(void) {
|
||||
fil_normalized(source, buffer);
|
||||
// MD5 selftests
|
||||
md5selftest();
|
||||
// cookie_get field extraction (tab-separated, 0-based)
|
||||
{
|
||||
char cbuf[8192];
|
||||
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 0), "a") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 1), "b") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 2), "c") == 0);
|
||||
// multi-char fields catch length/boundary bugs that 1-char fields hide
|
||||
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 0), "host") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 2), "/path/to") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\t\tc", 1), "") == 0); // empty field
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 9), "") == 0); // beyond last
|
||||
}
|
||||
}
|
||||
|
||||
/* Self-tests for the htssafe.h bounded string ops (driven by httrack -#8).
|
||||
@@ -2099,6 +2114,19 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
case '#':{ // non documenté
|
||||
com++;
|
||||
switch (*com) {
|
||||
case 'A': // cache self-test: httrack -#A <dir>
|
||||
if (na + 1 < argc) {
|
||||
const int err = cache_selftests(opt, argv[na + 1]);
|
||||
|
||||
printf("cache-selftest: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} else {
|
||||
fprintf(stderr, "Option #A requires a directory argument\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
|
||||
{
|
||||
int hasFilter = 0;
|
||||
@@ -2141,8 +2169,8 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
char firstline[256];
|
||||
char *a = cacheNdx;
|
||||
|
||||
a += cache_brstr(a, firstline);
|
||||
a += cache_brstr(a, firstline);
|
||||
a += cache_brstr(a, firstline, sizeof(firstline));
|
||||
a += cache_brstr(a, firstline, sizeof(firstline));
|
||||
while(a != NULL) {
|
||||
a = strchr(a + 1, '\n'); /* start of line */
|
||||
if (a) {
|
||||
|
||||
46
tests/01_engine-cache.test
Executable file
46
tests/01_engine-cache.test
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
#

# Cache create/read/update logic (driven by 'httrack -#A <dir>').
#
# The in-process self-test stores several hand-crafted edge entries (normal
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
# all-in-cache, a binary body with embedded NUL/high bytes), a few thousand
# small entries (index/lookup scale), and a few large compressible and
# incompressible bodies (zlib deflate/inflate). It reads everything back
# asserting every header field and the body round-trip byte for byte, then
# updates one entry and confirms the new value is read back. Failed checks are
# counted; the mode prints 'cache-selftest: FAIL' and returns that count when
# any check failed.

set -eu

dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT

# Like the other -# debug modes, a trailing token (the working directory) is
# required; a bare '-#A' falls through to the usage screen.
# Capture the exit status instead of letting 'set -e' abort here, so that a
# failing run still reaches the diagnostic below.
status=0
out=$(httrack -#A "$dir") || status=$?

# Match the exact success line, so the test cannot pass for an unrelated reason
# (e.g. the -#A mode being gone and falling through to the usage screen, which
# also exits non-zero but never prints this).
if test "$status" -ne 0 || test "$out" != "cache-selftest: OK"; then
  echo "expected 'cache-selftest: OK', got: $out (exit status $status)" >&2
  exit 1
fi

# The self-test must have actually produced a ZIP cache on disk.
test -e "$dir/hts-cache/new.zip" || {
  echo "no ZIP cache was written by the self-test" >&2
  exit 1
}

# Sanity-check the cache footprint: the few-thousand-entry pass is expected to
# weigh ~1-2 MB. Fail if it balloons well past that (e.g. a per-entry overhead
# regression or runaway growth), so the cache size stays bounded.
# 'du -b' is a GNU extension, so sum the file contents with POSIX tools
# instead; the arithmetic expansion strips the padding some 'wc'
# implementations emit.
ceiling=$((4 * 1024 * 1024))
bytes=$(($(find "$dir/hts-cache" -type f -exec cat {} + | wc -c)))
test "$bytes" -le "$ceiling" || {
  echo "cache footprint $bytes bytes exceeds ${ceiling} ceiling" >&2
  exit 1
}
|
||||
62
tests/02_update-cache.test
Executable file
62
tests/02_update-cache.test
Executable file
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
#

# Update path: re-mirroring a site reads the cache (cache_readex) to decide what
# is up to date -- a path the one-shot crawl tests never exercise. Offline
# (file://), so it always runs.
#
# 1. fresh mirror -> a cache must be written.
# 2. re-mirror unchanged -> the cache-read pass must complete clean
#    (guards against a crash/abort/error in cache_readex).
# 3. change a source file, re-mirror -> the update must pick up the new content
#    (guards the update decision that reads the cached metadata).

set -eu

site=$(mktemp -d)
out=$(mktemp -d)
trap 'rm -rf "$site" "$out"' EXIT

cat >"$site/index.html" <<EOF
<a href="a.html">a</a> <a href="sub/b.html">b</a>
EOF
echo 'OLDCONTENT' >"$site/a.html"
mkdir -p "$site/sub"
echo '<p>bbb</p>' >"$site/sub/b.html"

url="file://$site/index.html"

# count Error: lines in the log (grep -c exits 1 on zero matches: guard it)
errors() { grep -ciE '^[0-9:]*[[:space:]]Error:' "$out/hts-log.txt" || true; }

# 1. fresh mirror writes the cache
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test -e "$out/hts-cache/new.zip" || {
  echo "no cache was written" >&2
  exit 1
}

# 2. re-mirror unchanged: the update reads the cache and must complete cleanly
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test "$(errors)" = 0 || {
  echo "update (unchanged) reported errors" >&2
  exit 1
}
for suffix in a.html sub/b.html; do
  find "$out" -path "*/$suffix" | grep -q . || {
    echo "missing $suffix after update" >&2
    exit 1
  }
done

# 3. change a source file: the update must pick up the new content
# (the pause separates the rewrite in time from the previous mirror run)
sleep 1
echo 'NEWCONTENT' >"$site/a.html"
httrack "$url" -O "$out" -q -%v0 -r3 >/dev/null 2>&1
test "$(errors)" = 0 || {
  echo "update (changed) reported errors" >&2
  exit 1
}

# Check each mirrored a.html individually: handing the whole output of find to
# grep as one filename breaks as soon as there are zero or several matches.
found=0
while IFS= read -r mirrored; do
  found=1
  grep -q NEWCONTENT "$mirrored" || {
    echo "update did not pick up the changed source" >&2
    exit 1
  }
done < <(find "$out" -path '*/a.html')
test "$found" = 1 || {
  echo "missing a.html after update" >&2
  exit 1
}
|
||||
@@ -11,6 +11,7 @@ TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
||||
TEST_EXTENSIONS = .test
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-entities.test \
|
||||
@@ -22,6 +23,7 @@ TESTS = \
|
||||
01_engine-simplify.test \
|
||||
01_engine-strsafe.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
11_crawl-cookies.test \
|
||||
11_crawl-idna.test \
|
||||
|
||||
Reference in New Issue
Block a user