mirror of
https://github.com/xroche/httrack.git
synced 2026-07-04 16:14:47 +03:00
Compare commits
2 Commits
maxsize-te
...
naming-con
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92ad109c30 | ||
|
|
56b809c82d |
@@ -62,7 +62,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
htsalias.c htsthread.c htsindex.c htsbauth.c \
|
||||
htsmd5.c htszlib.c htswrap.c htsconcat.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c htssniff.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
@@ -70,7 +70,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
htsmodules.h htsname.h htsnet.h htssniff.h \
|
||||
htsmodules.h htsname.h htsnet.h \
|
||||
htsopt.h htsrobots.h htsthread.h \
|
||||
htstools.h htswizard.h htswrap.h htszlib.h \
|
||||
htsstrings.h htsarrays.h httrack-library.h \
|
||||
|
||||
@@ -1359,18 +1359,6 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
}
|
||||
|
||||
// delete entry
|
||||
/* Discard a cancelled mid-write .delayed placeholder (unusable across runs).
   Closes the slot's output stream when one is open, clears r.is_write, calls
   url_savename_refname_remove() for the slot's URL (only when opt is
   non-NULL), then unlinks back->url_sav. Returns nothing; failures of the
   individual steps are not reported. */
|
||||
static void back_delayed_discard(httrackp *opt, lien_back *back) {
|
||||
/* close the still-open output file and clear the pointer */
if (back->r.out != NULL) {
|
||||
fclose(back->r.out);
|
||||
back->r.out = NULL;
|
||||
}
|
||||
back->r.is_write = 0;
|
||||
/* the refname removal needs opt, so it is skipped when none is given */
if (opt != NULL)
|
||||
url_savename_refname_remove(opt, back->url_adr, back->url_fil);
|
||||
/* the UNLINK result is explicitly ignored */
(void) UNLINK(back->url_sav);
|
||||
}
|
||||
|
||||
int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
const int p) {
|
||||
lien_back *const back = sback->lnk;
|
||||
@@ -1378,12 +1366,6 @@ int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
|
||||
assertf(p >= 0 && p < back_max);
|
||||
if (p >= 0 && p < sback->count) { // on sait jamais..
|
||||
/* mid-write cancel: drop a .delayed placeholder; real-named partials
|
||||
survive for resume (--continue) */
|
||||
if (back[p].r.is_write && IS_DELAYED_EXT(back[p].url_sav) &&
|
||||
(back[p].status != STATUS_READY || back[p].r.statuscode <= 0)) {
|
||||
back_delayed_discard(opt, &back[p]);
|
||||
}
|
||||
// Vérificateur d'intégrité
|
||||
#if DEBUG_CHECKINT
|
||||
_CHECKINT(&back[p], "Appel back_delete")
|
||||
@@ -2437,34 +2419,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back_clean(opt, cache, sback);
|
||||
#endif
|
||||
|
||||
/* Time limit exceeded past grace: abort in-flight transfers so no wait loop
|
||||
starves (#481). FTP slots stay, their thread owns the socket. */
|
||||
if (!back_checkmirror(opt)) {
|
||||
int aborted = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < (unsigned int) back_max; i++) {
|
||||
if (back[i].status > 0 && back[i].status < STATUS_FTP_TRANSFER) {
|
||||
if (back[i].r.soc != INVALID_SOCKET) {
|
||||
deletehttp(&back[i].r);
|
||||
}
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
/* drop a .delayed placeholder; real partials survive for resume */
|
||||
if (back[i].r.is_write && IS_DELAYED_EXT(back[i].url_sav))
|
||||
back_delayed_discard(opt, &back[i]);
|
||||
back[i].r.statuscode = STATUSCODE_TIMEOUT;
|
||||
strcpybuff(back[i].r.msg, "Mirror Time Out");
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
aborted++;
|
||||
}
|
||||
}
|
||||
if (aborted > 0)
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"time limit reached, %d transfer(s) aborted", aborted);
|
||||
return;
|
||||
}
|
||||
|
||||
// recevoir tant qu'il y a des données (avec un maximum de max_loop boucles)
|
||||
do_wait = 0;
|
||||
gestion_timeout = 0;
|
||||
@@ -4210,11 +4164,6 @@ int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Grace left to the smooth stop before in-flight transfers are aborted.
   Computed as maxtime / 10 (integer division) and then passed through
   minimum(30, ..) and maximum(5, ..), i.e. kept within 5..30 assuming the
   usual min/max semantics of those macros. Same unit as maxtime;
   back_checkmirror() compares the result against elapsed - opt->maxtime. */
|
||||
static int back_maxtime_grace(const int maxtime) {
|
||||
return maximum(5, minimum(30, maxtime / 10));
|
||||
}
|
||||
|
||||
int back_checkmirror(httrackp * opt) {
|
||||
// Check max size
|
||||
if ((opt->maxsite > 0) && (HTS_STAT.stat_bytes >= opt->maxsite)) {
|
||||
@@ -4231,19 +4180,13 @@ int back_checkmirror(httrackp * opt) {
|
||||
*/
|
||||
}
|
||||
// Check max time
|
||||
if (opt->maxtime > 0) {
|
||||
const TStamp elapsed = time_local() - HTS_STAT.stat_timestart;
|
||||
|
||||
if (elapsed >= opt->maxtime) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
}
|
||||
/* smooth stop starved past the grace period: stop waiting (#481) */
|
||||
if (elapsed - opt->maxtime >= back_maxtime_grace(opt->maxtime))
|
||||
return 0;
|
||||
if ((opt->maxtime > 0)
|
||||
&& ((time_local() - HTS_STAT.stat_timestart) >= opt->maxtime)) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
}
|
||||
}
|
||||
return 1; /* Ok, go on */
|
||||
|
||||
@@ -136,8 +136,6 @@ void back_solve(httrackp * opt, lien_back * sback);
|
||||
int host_wait(httrackp * opt, lien_back * sback);
|
||||
#endif
|
||||
int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize);
|
||||
/* Enforce -M/-E quotas: requests a smooth stop when reached; returns 0 once
|
||||
the -E deadline overran its grace period (callers must stop waiting). */
|
||||
int back_checkmirror(httrackp * opt);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -596,18 +596,15 @@ htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
return cache_readex(opt, cache, adr, fil, save, location, NULL, 1);
|
||||
}
|
||||
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save) {
|
||||
htsblk r = cache_readex(opt, cache, adr, fil, NULL, NULL, return_save, 0);
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil) {
|
||||
htsblk r = cache_read(opt, cache, adr, fil, NULL, NULL);
|
||||
|
||||
if (r.statuscode == -1) {
|
||||
lien_back *itemback = NULL;
|
||||
|
||||
if (back_unserialize_ref(opt, adr, fil, &itemback) == 0) {
|
||||
r = itemback->r;
|
||||
if (return_save != NULL)
|
||||
strlcpybuff(return_save, itemback->url_sav, HTS_URLMAXSIZE * 2);
|
||||
/* cleanup */
|
||||
back_clear_entry(itemback); /* delete entry content */
|
||||
freet(itemback); /* delete item */
|
||||
|
||||
@@ -66,11 +66,8 @@ htsblk cache_read(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
/* Like cache_read, but also yields entries whose transfer broke; return_save
|
||||
(optional, HTS_URLMAXSIZE*2) receives the entry's recorded save name. */
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save);
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil);
|
||||
htsblk cache_readex(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location,
|
||||
char *return_save, int readonly);
|
||||
|
||||
220
src/htsname.c
220
src/htsname.c
@@ -41,10 +41,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htstools.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#define ADD_STANDARD_PATH \
|
||||
@@ -74,36 +70,30 @@ static const char *hts_tbdev[] = {
|
||||
""
|
||||
};
|
||||
|
||||
/* Loop while back_pluggable_sockets_strict(sback, opt) <= 0.
   Each iteration sets opt->state._hts_in_html_parsing to 6, calls
   back_wait(), leaves the loop early when back_checkmirror() returns 0,
   refreshes the HTS_STAT counters and runs the `loop` callback.
   A zero callback result makes the ENCLOSING function return -1, without
   restoring _hts_in_html_parsing; otherwise the previous value is restored
   after the loop (also after the early break).
   Expects opt, sback, cache and ptr to be in scope at the expansion site. */
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() \
|
||||
do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback, opt, cache, 0); \
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */ \
|
||||
if (!back_checkmirror(opt)) \
|
||||
break; \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \
|
||||
/* Check */ \
|
||||
{ \
|
||||
if (!RUN_CALLBACK7( \
|
||||
opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while (0)
|
||||
/* Loop while back_pluggable_sockets_strict(sback, opt) <= 0.
   Each iteration sets opt->state._hts_in_html_parsing to 6, calls
   back_wait(), refreshes the HTS_STAT counters and runs the `loop` callback.
   A zero callback result makes the ENCLOSING function return -1, without
   restoring _hts_in_html_parsing; otherwise the previous value is restored
   once the loop ends. This variant has no back_checkmirror() exit.
   Expects opt, sback, cache and ptr to be in scope at the expansion site. */
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state. _hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback,opt,cache,0); \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket=back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
|
||||
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
|
||||
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
|
||||
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
|
||||
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
|
||||
/* Check */ \
|
||||
{ \
|
||||
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count,-1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while(0)
|
||||
|
||||
/* Strip all // */
|
||||
static void cleanDoubleSlash(char *s) {
|
||||
@@ -150,7 +140,8 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
|
||||
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
|
||||
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
|
||||
(#267 guard), a declared disagreement is CONTESTED (sniffed below). */
|
||||
(#267 guard), a declared disagreement is CONTESTED. Sentinel and verdict
|
||||
ride the cache, so updates stay consistent. */
|
||||
typedef enum wire_verdict {
|
||||
WIRE_KEEPS_EXT,
|
||||
WIRE_WINS,
|
||||
@@ -174,105 +165,8 @@ static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
|
||||
return WIRE_CONTESTED;
|
||||
}
|
||||
|
||||
/* Optional evidence for a contested wire-vs-ext verdict.
   Members may be left NULL: sniff_body_head() only does the live lookup when
   sback, adr and fil are all set, only falls back to the snapshot when
   headers is set, and wire_patches_ext() ignores a NULL or empty prev_save. */
|
||||
typedef struct sniff_src {
|
||||
struct_back *sback; /* live backing (looked up by adr/fil) */
|
||||
const lien_back *headers; /* snapshot: r.adr, else the url_sav file */
|
||||
/* address/file pair handed to back_index() to find the live slot */
const char *adr, *fil;
|
||||
const char *prev_save; /* previous run's save name (cache X-Save) */
|
||||
} sniff_src;
|
||||
|
||||
#if HTS_USEZLIB
|
||||
/* Inflate the head of a gzip/zlib stream; 0 when undecodable.
   in/in_len:   compressed input bytes.
   out/out_len: destination buffer and its capacity.
   Returns the number of bytes written to out, or 0 when inflateInit2() fails
   or inflate() reports anything other than Z_OK, Z_STREAM_END or
   Z_BUF_ERROR. */
|
||||
static size_t sniff_inflate_head(const void *in, size_t in_len, void *out,
|
||||
size_t out_len) {
|
||||
z_stream zs;
|
||||
size_t n = 0;
|
||||
int err;
|
||||
|
||||
/* zero the stream state before handing it to inflateInit2() */
memset(&zs, 0, sizeof(zs));
|
||||
if (inflateInit2(&zs, 47) != Z_OK) /* 47: gzip or zlib, autodetected */
|
||||
return 0;
|
||||
/* lengths are narrowed to zlib's uInt */
zs.next_in = (const Bytef *) in;
|
||||
zs.avail_in = (uInt) in_len;
|
||||
zs.next_out = (Bytef *) out;
|
||||
zs.avail_out = (uInt) out_len;
|
||||
/* one single inflate() pass over the supplied input */
err = inflate(&zs, Z_SYNC_FLUSH);
|
||||
/* Z_BUF_ERROR is accepted too: the bytes produced so far are still counted */
if (err == Z_OK || err == Z_STREAM_END || err == Z_BUF_ERROR)
|
||||
n = out_len - zs.avail_out;
|
||||
/* release the stream on both the success and the failure path */
inflateEnd(&zs);
|
||||
return n;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Read up to len leading bytes of the file named by path into buf.
   The name is passed through fconv() (using a local scratch buffer) before
   FOPEN() opens it in binary read mode.
   Returns the fread() count; 0 when the file cannot be opened. */
static size_t sniff_read_head(const char *path, void *buf, size_t len) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), path), "rb");
|
||||
size_t n = 0;
|
||||
|
||||
/* n stays 0 when the open failed */
if (fp != NULL) {
|
||||
n = fread(buf, 1, len, fp);
|
||||
fclose(fp);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Body head of one slot: memory, else its flushed on-disk file (url_sav, or
|
||||
tmpfile for a compressed stream); inflated so the sniff sees the final body.
|
||||
*/
|
||||
/* slot: backing entry to read from; buf/len: destination and its capacity.
   Returns the number of body bytes stored in buf, 0 when none are available
   (or when the body is compressed and zlib support is compiled out). */
static size_t sniff_slot_head(const lien_back *slot, void *buf, size_t len) {
|
||||
const htsblk *const r = &slot->r;
|
||||
size_t n = 0;
|
||||
|
||||
/* in-memory body: copy min(r->size, len) bytes */
if (r->adr != NULL && r->size > 0) {
|
||||
n = (size_t) r->size < len ? (size_t) r->size : len;
|
||||
memcpy(buf, r->adr, n);
|
||||
} else {
|
||||
/* flush the open output stream before reading the file back */
if (r->out != NULL)
|
||||
fflush(r->out);
|
||||
if (slot->url_sav[0] != '\0')
|
||||
n = sniff_read_head(slot->url_sav, buf, len);
|
||||
/* nothing read from url_sav: try the slot's tmpfile when it has one */
if (n == 0 && slot->tmpfile != NULL && slot->tmpfile[0] != '\0')
|
||||
n = sniff_read_head(slot->tmpfile, buf, len);
|
||||
}
|
||||
/* compressed body: replace the raw bytes in buf by their inflated form */
if (n > 0 && r->compressed) {
|
||||
#if HTS_USEZLIB
|
||||
unsigned char raw[HTS_SNIFF_LEN];
|
||||
|
||||
/* buf is the inflate destination, so the input is first moved to a scratch
   copy, capped at sizeof(raw) */
if (n > sizeof(raw))
|
||||
n = sizeof(raw);
|
||||
memcpy(raw, buf, n);
|
||||
n = sniff_inflate_head(raw, n, buf, len);
|
||||
#else
|
||||
/* no zlib: the compressed bytes cannot be decoded, report none */
n = 0;
|
||||
#endif
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Up to len leading body bytes; 0 when unavailable, and always in
|
||||
non-delayed mode (its HEAD-probe first run couldn't sniff either). */
|
||||
/* opt: options (savename_delayed is consulted); src: evidence sources, may be
   NULL; buf/len: destination and its capacity.
   Returns the byte count stored in buf by sniff_slot_head(), 0 when no source
   yields any. */
static size_t sniff_body_head(httrackp *opt, const sniff_src *src, void *buf,
|
||||
size_t len) {
|
||||
size_t n = 0;
|
||||
|
||||
/* no evidence supplied, or delayed naming disabled: nothing to sniff */
if (src == NULL || opt->savename_delayed == HTS_SAVENAME_DELAYED_NONE)
|
||||
return 0;
|
||||
/* live backing slot: a snapshot (back_copy_static) loses r.adr/r.out */
|
||||
if (src->sback != NULL && src->adr != NULL && src->fil != NULL) {
|
||||
const int b = back_index(opt, src->sback, src->adr, src->fil, NULL);
|
||||
|
||||
/* a negative index means no live slot was found for adr/fil */
if (b >= 0)
|
||||
n = sniff_slot_head(&src->sback->lnk[b], buf, len);
|
||||
}
|
||||
/* the live slot gave nothing: fall back to the headers snapshot */
if (n == 0 && src->headers != NULL)
|
||||
n = sniff_slot_head(src->headers, buf, len);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Contested verdicts: magic proving the URL ext keeps it, else wire wins. */
|
||||
static int wire_patches_ext(httrackp *opt, const sniff_src *src,
|
||||
const char *wiremime, const char *file) {
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
|
||||
@@ -281,51 +175,22 @@ static int wire_patches_ext(httrackp *opt, const sniff_src *src,
|
||||
case WIRE_WINS:
|
||||
return 1;
|
||||
case WIRE_CONTESTED:
|
||||
break;
|
||||
}
|
||||
if (src != NULL) {
|
||||
if (hts_sniff_mime_known(urlmime)) {
|
||||
unsigned char head[HTS_SNIFF_LEN];
|
||||
const size_t n = sniff_body_head(opt, src, head, sizeof(head));
|
||||
|
||||
if (n > 0)
|
||||
return hts_sniff_mime_consistent(head, n, urlmime) ? 0 : 1;
|
||||
}
|
||||
/* no bytes: reproduce the previous run's verdict (cached X-Save name) */
|
||||
if (src->prev_save != NULL && src->prev_save[0] != '\0') {
|
||||
char prevmime[256];
|
||||
|
||||
prevmime[0] = '\0';
|
||||
if (get_httptype_sized(opt, prevmime, sizeof(prevmime), src->prev_save,
|
||||
0) &&
|
||||
strfield2(prevmime, urlmime))
|
||||
return 0;
|
||||
}
|
||||
break; /* no content evidence is consulted today: trust the wire */
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Tells whether a body sniff could settle the naming of `file`.
   Non-zero only when all of these hold: wiremime is non-NULL and non-empty,
   wire_ext_verdict() returns WIRE_CONTESTED for (wiremime, file), and
   hts_sniff_mime_known() accepts the URL-side MIME type that
   wire_ext_verdict() stored in urlmime. Returns 0 otherwise. */
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
/* scratch buffer filled by wire_ext_verdict() */
char urlmime[256];
|
||||
|
||||
return wiremime != NULL && strnotempty(wiremime) &&
|
||||
wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime)) ==
|
||||
WIRE_CONTESTED &&
|
||||
hts_sniff_mime_known(urlmime);
|
||||
}
|
||||
|
||||
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
|
||||
else the declared type's ext when wire_patches_ext() allows (returns 1),
|
||||
else 0. ext receives the new extension or replacement filename. */
|
||||
static int resolve_extension(httrackp *opt, const sniff_src *src,
|
||||
const char *cdispo, const char *contenttype,
|
||||
const char *fil, char *ext, size_t ext_size) {
|
||||
static int resolve_extension(httrackp *opt, const char *cdispo,
|
||||
const char *contenttype, const char *fil,
|
||||
char *ext, size_t ext_size) {
|
||||
if (strnotempty(cdispo)) {
|
||||
strlcpybuff(ext, cdispo, ext_size);
|
||||
return 2;
|
||||
}
|
||||
if (wire_patches_ext(opt, src, contenttype, fil) &&
|
||||
if (wire_patches_ext(opt, contenttype, fil) &&
|
||||
give_mimext(ext, ext_size, contenttype))
|
||||
return 1;
|
||||
return 0;
|
||||
@@ -577,21 +442,14 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
|
||||
// lire dans le cache
|
||||
char BIGSTK previous_save[HTS_URLMAXSIZE * 2];
|
||||
htsblk r;
|
||||
|
||||
previous_save[0] = '\0';
|
||||
r = cache_read_including_broken(opt, cache, adr, fil,
|
||||
previous_save); // test uniquement
|
||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||
|
||||
if (r.statuscode != -1) { // cache entry read OK
|
||||
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
|
||||
adr_complete, fil_complete);
|
||||
if (!HTTP_IS_REDIRECT(r.statuscode)) {
|
||||
const sniff_src src = {sback, NULL, adr, fil, previous_save};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, r.cdispo, r.contenttype,
|
||||
fil, ext, sizeof(ext));
|
||||
ext_chg = resolve_extension(opt, r.cdispo, r.contenttype, fil,
|
||||
ext, sizeof(ext));
|
||||
}
|
||||
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
@@ -618,9 +476,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
!opt->state.stop) {
|
||||
// Check if the file is ready in backing.
|
||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||
const sniff_src src = {sback, headers, adr, fil, NULL};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, headers->r.cdispo,
|
||||
ext_chg = resolve_extension(opt, headers->r.cdispo,
|
||||
headers->r.contenttype,
|
||||
headers->url_fil, ext, sizeof(ext));
|
||||
}
|
||||
@@ -831,7 +687,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
|
||||
// no error: change the type?
|
||||
ext_chg = resolve_extension(
|
||||
opt, NULL, back[b].r.cdispo, back[b].r.contenttype,
|
||||
opt, back[b].r.cdispo, back[b].r.contenttype,
|
||||
back[b].url_fil, ext, sizeof(ext));
|
||||
}
|
||||
// FIN Si non déplacé, forcer type?
|
||||
|
||||
@@ -100,8 +100,6 @@ void standard_name(char *b, size_t bsize, const char *dot_pos,
|
||||
const char *nom_pos, const char *fil_complete,
|
||||
int short_ver);
|
||||
void url_savename_addstr(char *d, const char *s);
|
||||
/* Contested wire-vs-ext verdict that a body sniff could settle (htssniff.h). */
|
||||
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime, const char *file);
|
||||
char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
|
||||
@@ -49,7 +49,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsindex.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
|
||||
/* external modules */
|
||||
#include "htsmodules.h"
|
||||
@@ -4077,9 +4076,6 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) { // on fait la pause..
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
@@ -4699,26 +4695,22 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
back_set_unlocked(sback, b);
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101) ||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Contested type: wait for a sniffable body head (or EOF) */
|
||||
(back[b].r.statuscode == HTTP_OK && back[b].status > 0 &&
|
||||
strnotempty(back[b].r.cdispo) == 0 &&
|
||||
back[b].r.size < HTS_SNIFF_LEN &&
|
||||
hts_ext_sniff_wanted(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)));
|
||||
} while(
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101)
|
||||
||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0)
|
||||
||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0)
|
||||
);
|
||||
if (b >= 0) {
|
||||
back_set_unlocked(sback, b); // Unlocked entry
|
||||
}
|
||||
@@ -4853,8 +4845,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* patch url_sav BEFORE finalize: it records/caches under this name
|
||||
*/
|
||||
/* Patch destination filename for direct-to-disk mode, BEFORE any
|
||||
finalize: it records and caches the entry under url_sav */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
/* Finalize now as we have the type */
|
||||
if (back[b].status == STATUS_READY) {
|
||||
|
||||
@@ -175,33 +175,27 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
/* Apply changes */ \
|
||||
* str->ptr_ = ptr
|
||||
|
||||
/* Loop while back_pluggable_sockets_strict(sback, opt) <= 0.
   Each iteration sets opt->state._hts_in_html_parsing to 6, calls
   back_wait(), leaves the loop early when back_checkmirror() returns 0,
   refreshes the HTS_STAT counters and runs the `loop` callback.
   A zero callback result makes the ENCLOSING function return -1, without
   restoring _hts_in_html_parsing; otherwise the previous value is restored
   after the loop (also after the early break).
   Expects opt, sback, cache and ptr to be in scope at the expansion site. */
#define WAIT_FOR_AVAILABLE_SOCKET() \
|
||||
do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback, opt, cache, 0); \
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */ \
|
||||
if (!back_checkmirror(opt)) \
|
||||
break; \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error"); \
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning"); \
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info"); \
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr); \
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback); \
|
||||
/* Check */ \
|
||||
if (!RUN_CALLBACK7( \
|
||||
opt, loop, sback->lnk, sback->count, -1, ptr, opt->lien_tot, \
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while (0)
|
||||
/* Loop while back_pluggable_sockets_strict(sback, opt) <= 0.
   Each iteration sets opt->state._hts_in_html_parsing to 6, calls
   back_wait(), refreshes the HTS_STAT counters and runs the `loop` callback.
   A zero callback result makes the ENCLOSING function return -1, without
   restoring _hts_in_html_parsing; otherwise the previous value is restored
   once the loop ends. This variant has no back_checkmirror() exit.
   Expects opt, sback, cache and ptr to be in scope at the expansion site. */
#define WAIT_FOR_AVAILABLE_SOCKET() do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback,opt,cache,0); \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket=back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
|
||||
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
|
||||
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
|
||||
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
|
||||
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
|
||||
/* Check */ \
|
||||
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,7 +52,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsencoding.h"
|
||||
#include "htsftp.h"
|
||||
#include "htsmd5.h"
|
||||
#include "htssniff.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
@@ -1142,22 +1141,6 @@ static size_t st_decode_body(const char *arg, char *buf, size_t size) {
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Handler for the "sniff" self-test entry (usage in the self-test table:
   "<content-type> <hex:..|text>").
   argv[0] is the content type, argv[1] the body, decoded by st_decode_body()
   into a 1024-byte local buffer. Prints one line with the results of
   hts_sniff_mime_known() and hts_sniff_mime_consistent() as 0/1.
   Returns 1 when fewer than two arguments are given, 0 otherwise. */
static int st_sniff(httrackp *opt, int argc, char **argv) {
|
||||
char BIGSTK body[1024];
|
||||
size_t n;
|
||||
|
||||
/* opt is not used by this handler */
(void) opt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "sniff: needs a content-type and a body\n");
|
||||
return 1;
|
||||
}
|
||||
/* n is the decoded body length passed on to the consistency check */
n = st_decode_body(argv[1], body, sizeof(body));
|
||||
printf("sniff: known=%d consistent=%d\n",
|
||||
hts_sniff_mime_known(argv[0]) == HTS_TRUE,
|
||||
hts_sniff_mime_consistent(body, n, argv[0]) == HTS_TRUE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
lien_adrfilsave afs;
|
||||
cache_back cache;
|
||||
@@ -1228,7 +1211,7 @@ static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
}
|
||||
*sep = '\0';
|
||||
/* one-entry cache in cwd, reopened read-only; body is PNG magic on
|
||||
purpose: only the recorded name (X-Save) may drive the naming */
|
||||
purpose: naming must not depend on stored content */
|
||||
StringCopy(opt->path_log, "");
|
||||
cache.type = 1;
|
||||
cache.log = cache.errlog = stderr;
|
||||
@@ -1289,7 +1272,7 @@ static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
if (cdispo != NULL)
|
||||
strcpybuff(headers.r.cdispo, cdispo);
|
||||
strcpybuff(headers.url_fil, argv[0]);
|
||||
if (body != NULL) { /* leading body bytes, read via url_sav */
|
||||
if (body != NULL) { /* leading body bytes, exposed via url_sav */
|
||||
char BIGSTK data[1024];
|
||||
const size_t n = st_decode_body(body, data, sizeof(data));
|
||||
FILE *const fp = fopen(bodyfile, "wb");
|
||||
@@ -2112,8 +2095,6 @@ static const struct selftest_entry {
|
||||
st_header},
|
||||
{"savename", "<fil> <content-type> [key=value ...]",
|
||||
"local save-name for a URL", st_savename},
|
||||
{"sniff", "<content-type> <hex:..|text>", "MIME magic consistency",
|
||||
st_sniff},
|
||||
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
|
||||
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
|
||||
st_cache_golden},
|
||||
|
||||
352
src/htssniff.c
352
src/htssniff.c
@@ -1,352 +0,0 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include "htssniff.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "htslib.h"
|
||||
|
||||
/* One magic rule: `len` bytes at `off` confirm `mime`.
   The same mime may appear in several rules of sniff_table (e.g. the two
   image/gif signatures). */
|
||||
typedef struct sniff_magic {
|
||||
/* MIME type this rule vouches for */
const char *mime;
|
||||
/* byte offset of the signature within the body */
unsigned short off;
|
||||
/* signature length; explicit because patterns may contain NUL bytes
   (e.g. the "II*\0" TIFF entry) */
unsigned char len;
|
||||
/* the signature bytes themselves (len of them) */
const char *bytes;
|
||||
} sniff_magic;
|
||||
|
||||
/* Direction is mime -> magic (verify a claim, never classify); types with
|
||||
no reliable magic (plain text, css, js..) are deliberately absent. Patterns
|
||||
follow the WHATWG MIME Sniffing Standard tables where it defines them
|
||||
(https://mimesniff.spec.whatwg.org/); the rest covers httrack's wider MIME
|
||||
set. Spec-only types absent from our MIME tables (EOT, font/collection)
|
||||
are omitted as unreachable. */
|
||||
static const sniff_magic sniff_table[] = {
|
||||
/* images */
|
||||
{"image/jpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pipeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pjpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/png", 0, 8, "\x89PNG\r\n\x1a\n"},
|
||||
{"image/gif", 0, 6, "GIF87a"},
|
||||
{"image/gif", 0, 6, "GIF89a"},
|
||||
{"image/bmp", 0, 2, "BM"},
|
||||
{"image/tiff", 0, 4, "II*\0"},
|
||||
{"image/tiff", 0, 4, "MM\0*"},
|
||||
{"image/x-icon", 0, 4, "\0\0\1\0"},
|
||||
{"image/x-icon", 0, 4, "\0\0\2\0"}, /* Windows cursor, per the spec */
|
||||
{"image/x-portable-bitmap", 0, 2, "P1"},
|
||||
{"image/x-portable-bitmap", 0, 2, "P4"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P3"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P6"},
|
||||
{"image/x-xpixmap", 0, 9, "/* XPM */"},
|
||||
{"image/x-xbitmap", 0, 7, "#define"},
|
||||
{"image/x-rgb", 0, 2, "\x01\xda"},
|
||||
{"image/x-cmu-raster", 0, 4, "\xf1\x00\x40\xbb"},
|
||||
/* audio */
|
||||
{"audio/mpeg", 0, 3, "ID3"},
|
||||
{"audio/basic", 0, 4, ".snd"},
|
||||
{"audio/mid", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/midi", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".RMF"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".RMF"},
|
||||
{"audio/flac", 0, 4, "fLaC"},
|
||||
{"audio/aac", 0, 4, "ADIF"},
|
||||
/* video */
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xba"},
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xb3"},
|
||||
{"video/x-sgi-movie", 0, 4, "MOVI"},
|
||||
/* archives / compression */
|
||||
{"application/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"multipart/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compressed", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compress", 0, 2, "\x1f\x9d"},
|
||||
{"application/x-bzip2", 0, 3, "BZh"},
|
||||
{"application/x-7z-compressed", 0, 6, "7z\xbc\xaf\x27\x1c"},
|
||||
/* 6-byte prefix common to RAR4 (spec) and RAR5 */
|
||||
{"application/x-rar-compressed", 0, 6, "Rar!\x1a\x07"},
|
||||
{"application/zstd", 0, 4, "\x28\xb5\x2f\xfd"},
|
||||
{"application/arj", 0, 2, "\x60\xea"},
|
||||
{"application/x-cpio", 0, 6, "070701"},
|
||||
{"application/x-cpio", 0, 6, "070707"},
|
||||
{"application/x-cpio", 0, 2, "\xc7\x71"},
|
||||
{"application/x-sv4cpio", 0, 6, "070701"},
|
||||
{"application/x-sv4crc", 0, 6, "070702"},
|
||||
{"application/x-stuffit", 0, 8, "StuffIt "},
|
||||
{"application/x-stuffit", 0, 4, "SIT!"},
|
||||
{"application/mac-binhex40", 0, 10, "(This file"},
|
||||
/* documents */
|
||||
{"application/pdf", 0, 5, "%PDF-"},
|
||||
{"application/postscript", 0, 2, "%!"},
|
||||
{"application/rtf", 0, 5, "{\\rtf"},
|
||||
{"application/x-dvi", 0, 2, "\xf7\x02"},
|
||||
{"application/x-hdf", 0, 4, "\x0e\x03\x13\x01"},
|
||||
{"application/x-hdf", 0, 8, "\x89HDF\r\n\x1a\n"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x01"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x02"},
|
||||
{"application/x-msaccess", 0, 19, "\0\1\0\0Standard Jet DB"},
|
||||
/* fonts */
|
||||
{"font/woff", 0, 4, "wOFF"},
|
||||
{"font/woff2", 0, 4, "wOF2"},
|
||||
{"font/ttf", 0, 4, "\0\1\0\0"},
|
||||
{"font/ttf", 0, 4, "true"},
|
||||
{"font/otf", 0, 4, "OTTO"},
|
||||
/* misc */
|
||||
{"application/x-shockwave-flash", 0, 3, "FWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "CWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "ZWS"},
|
||||
{"application/futuresplash", 0, 3, "FWS"},
|
||||
{"application/x-director", 0, 4, "RIFX"},
|
||||
{"application/x-director", 0, 4, "XFIR"},
|
||||
{"application/x-java-vm", 0, 4, "\xca\xfe\xba\xbe"},
|
||||
{"application/wasm", 0, 4, "\0asm"},
|
||||
{"application/x-msmetafile", 0, 4, "\xd7\xcd\xc6\x9a"},
|
||||
{"application/x-msmetafile", 0, 4, "\x01\x00\x09\x00"},
|
||||
{"application/x-x509-ca-cert", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs12", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-mime", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-signature", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-certificates", 0, 2, "\x30\x82"},
|
||||
{"x-world/x-vrml", 0, 5, "#VRML"},
|
||||
{"application/x-bittorrent", 0, 11, "d8:announce"},
|
||||
{"drawing/x-dwf", 0, 4, "(DWF"},
|
||||
{"application/acad", 0, 4, "AC10"},
|
||||
{NULL, 0, 0, NULL}};
|
||||
|
||||
/* MIME families that share one container magic. Every list is terminated by
   a NULL entry. */

/* ZIP archives, matched by exact MIME name */
static const char *const zip_mimes[] = {
  "application/zip",
  "application/x-zip-compressed",
  "multipart/x-zip",
  NULL
};

/* ZIP-based office documents, matched by MIME name prefix */
static const char *const zip_mime_prefixes[] = {
  "application/vnd.openxmlformats-officedocument.",
  "application/vnd.oasis.opendocument.",
  NULL
};

/* OLE compound documents */
static const char *const ole_mimes[] = {
  "application/msword",
  "application/excel",
  "application/vnd.ms-excel",
  "application/powerpoint",
  "application/vnd.ms-powerpoint",
  "application/vnd.ms-project",
  "application/vnd.ms-works",
  "application/x-msmoney",
  "application/x-mspublisher",
  NULL
};

/* tar archives */
static const char *const tar_mimes[] = {
  "application/x-tar",
  "application/x-ustar",
  "application/x-gtar",
  NULL
};

/* Ogg container */
static const char *const ogg_mimes[] = {
  "application/ogg",
  "audio/ogg",
  "video/ogg",
  "audio/opus",
  NULL
};

/* EBML container (WebM) */
static const char *const ebml_mimes[] = {
  "video/webm",
  "audio/webm",
  NULL
};

/* ISO-BMFF, any 'ftyp' brand: containers overlap too much to split */
static const char *const bmff_mimes[] = {
  "video/mp4",
  "audio/mp4",
  "video/quicktime",
  NULL
};

/* ISO-BMFF image types told apart by their 'ftyp' brand */
static const char *const avif_mimes[] = {
  "image/avif",
  NULL
};

static const char *const heic_mimes[] = {
  "image/heic",
  NULL
};

/* ASF container */
static const char *const asf_mimes[] = {
  "video/x-ms-asf",
  "video/x-ms-wmv",
  "video/x-la-asf",
  NULL
};

/* Types that may start with an XML declaration (SVG included) */
static const char *const xml_mimes[] = {
  "application/xml",
  "text/xml",
  "image/svg+xml",
  "image/svg-xml",
  NULL
};

/* SVG, which may also start directly with its root element or doctype */
static const char *const svg_mimes[] = {
  "image/svg+xml",
  "image/svg-xml",
  NULL
};

static const char *const html_mimes[] = {
  "text/html",
  NULL
};

/* Certificate types that may also be served as "-----BEGIN" text */
static const char *const pem_mimes[] = {
  "application/x-x509-ca-cert",
  "application/x-pkcs7-certificates",
  "application/x-pkcs7-mime",
  "application/x-pkcs7-signature",
  NULL
};
|
||||
|
||||
static hts_boolean mime_in(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield2(list[i], mime))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean mime_in_prefix(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield(mime, list[i]))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean has_bytes(const unsigned char *d, size_t n, size_t off,
|
||||
const char *bytes, size_t len) {
|
||||
/* overflow-safe: untrusted n alone on one side */
|
||||
return n >= off && len <= n - off && memcmp(d + off, bytes, len) == 0
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Lower-case an ASCII upper-case letter; any other byte is returned as is
   (locale-independent). */
static unsigned char ascii_lower(unsigned char c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return (unsigned char) (c + 32);
}
|
||||
|
||||
/* Case-insensitive text prefix after an optional UTF-8 BOM and whitespace. */
|
||||
static hts_boolean has_text_prefix(const unsigned char *d, size_t n,
|
||||
const char *prefix) {
|
||||
const size_t len = strlen(prefix);
|
||||
size_t i, k;
|
||||
|
||||
i = n >= 3 && memcmp(d, "\xef\xbb\xbf", 3) == 0 ? 3 : 0;
|
||||
while (i < n && (d[i] == ' ' || d[i] == '\t' || d[i] == '\r' || d[i] == '\n'))
|
||||
i++;
|
||||
if (len > n - i) /* i <= n from the loop above */
|
||||
return HTS_FALSE;
|
||||
for (k = 0; k < len; k++)
|
||||
if (ascii_lower(d[i + k]) != ascii_lower((unsigned char) prefix[k]))
|
||||
return HTS_FALSE;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
/* Question asked of the rule walk in sniff_eval(). */
typedef enum sniff_op {
  /* does at least one rule exist for this MIME type? */
  SNIFF_QUERY_KNOWN,
  /* do the body bytes satisfy a rule of this MIME type? */
  SNIFF_QUERY_MATCH
} sniff_op;
|
||||
|
||||
/* Single walk for both queries so the rule set can't drift apart. */
|
||||
static hts_boolean sniff_eval(sniff_op op, const unsigned char *d, size_t n,
|
||||
const char *mime) {
|
||||
size_t i;
|
||||
|
||||
/* KNOWN short-circuits; MATCH tests the magic */
|
||||
#define SNIFF_RULE(cond) \
|
||||
do { \
|
||||
if (op == SNIFF_QUERY_KNOWN) \
|
||||
return HTS_TRUE; \
|
||||
if (cond) \
|
||||
return HTS_TRUE; \
|
||||
} while (0)
|
||||
|
||||
for (i = 0; sniff_table[i].mime != NULL; i++) {
|
||||
if (strfield2(sniff_table[i].mime, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, sniff_table[i].off, sniff_table[i].bytes,
|
||||
sniff_table[i].len));
|
||||
}
|
||||
}
|
||||
if (mime_in(zip_mimes, mime) || mime_in_prefix(zip_mime_prefixes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "PK\3\4", 4) ||
|
||||
has_bytes(d, n, 0, "PK\5\6", 4));
|
||||
}
|
||||
if (mime_in(ole_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8));
|
||||
}
|
||||
if (mime_in(tar_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 257, "ustar", 5));
|
||||
}
|
||||
if (mime_in(ogg_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "OggS\0", 5));
|
||||
}
|
||||
if (mime_in(ebml_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x1a\x45\xdf\xa3", 4));
|
||||
}
|
||||
if (mime_in(bmff_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftyp", 4));
|
||||
}
|
||||
if (mime_in(avif_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftypavif", 8) ||
|
||||
has_bytes(d, n, 4, "ftypavis", 8));
|
||||
}
|
||||
if (mime_in(heic_mimes, mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 4, "ftyphei", 7) || has_bytes(d, n, 4, "ftyphev", 7) ||
|
||||
has_bytes(d, n, 4, "ftypmif1", 8) || has_bytes(d, n, 4, "ftypmsf1", 8));
|
||||
}
|
||||
if (mime_in(asf_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x30\x26\xb2\x75\x8e\x66\xcf\x11", 8));
|
||||
}
|
||||
if (strfield2("audio/x-wav", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "WAVE", 4));
|
||||
}
|
||||
if (strfield2("video/x-msvideo", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "AVI ", 4));
|
||||
}
|
||||
if (strfield2("image/webp", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) &&
|
||||
has_bytes(d, n, 8, "WEBPVP", 6));
|
||||
}
|
||||
if (strfield2("image/x-portable-anymap", mime)) {
|
||||
SNIFF_RULE(n >= 2 && d[0] == 'P' && d[1] >= '1' && d[1] <= '6');
|
||||
}
|
||||
if (strfield2("audio/x-aiff", mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 0, "FORM", 4) &&
|
||||
(has_bytes(d, n, 8, "AIFF", 4) || has_bytes(d, n, 8, "AIFC", 4)));
|
||||
}
|
||||
if (strfield2("audio/mpeg", mime)) {
|
||||
/* MPEG audio frame sync (11 bits), valid layer and bitrate fields */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xe0) == 0xe0 &&
|
||||
(d[1] & 0x06) != 0);
|
||||
}
|
||||
if (strfield2("audio/aac", mime)) {
|
||||
/* ADTS sync */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xf6) == 0xf0);
|
||||
}
|
||||
if (strfield2("video/mp2t", mime)) {
|
||||
SNIFF_RULE(n >= 1 && d[0] == 0x47 && (n <= 188 || d[188] == 0x47));
|
||||
}
|
||||
if (mime_in(xml_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<?xml"));
|
||||
}
|
||||
if (mime_in(svg_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<svg") ||
|
||||
has_text_prefix(d, n, "<!DOCTYPE svg"));
|
||||
}
|
||||
if (mime_in(html_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<!DOCTYPE") ||
|
||||
has_text_prefix(d, n, "<html") ||
|
||||
has_text_prefix(d, n, "<head"));
|
||||
}
|
||||
if (mime_in(pem_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "-----BEGIN"));
|
||||
}
|
||||
if (strfield2("audio/x-mpegurl", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "#EXTM3U"));
|
||||
}
|
||||
if (strfield2("text/x-vcard", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "BEGIN:VCARD"));
|
||||
}
|
||||
#undef SNIFF_RULE
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_known(const char *mime) {
|
||||
if (mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_KNOWN, NULL, 0, mime);
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime) {
|
||||
if (data == NULL || size == 0 || mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_MATCH, (const unsigned char *) data, size,
|
||||
mime);
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSSNIFF_DEFH
|
||||
#define HTSSNIFF_DEFH
|
||||
|
||||
#include <stddef.h>
|
||||
#include "htsglobal.h"
|
||||
|
||||
/* Leading-body window read to arbitrate a wire/extension MIME conflict. */
|
||||
#define HTS_SNIFF_LEN 512
|
||||
|
||||
/* Can a magic rule ever confirm this MIME? (whether sniffing is worth it) */
|
||||
hts_boolean hts_sniff_mime_known(const char *mime);
|
||||
|
||||
/* TRUE when the leading body bytes are consistent with the claimed MIME;
|
||||
FALSE on unknown MIME, unknown magic, or too-short data (fail-safe). */
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime);
|
||||
|
||||
#endif
|
||||
@@ -81,14 +81,15 @@ name '/x.pdf' 'text/html' 'x.html' status=-1
|
||||
name '/x.html' 'text/html' 'x.html' status=-1
|
||||
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
|
||||
|
||||
# Contested type (wire disagrees with a specific ext): magic bytes proving the
|
||||
# extension right keep it, anything else trusts the wire as before.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' body=hex:FFD8FFE000104A46
|
||||
# Contested type (wire disagrees with a specific ext): the wire is trusted and
|
||||
# body bytes are not consulted; pinned so a content-based tie-break shows up
|
||||
# as an explicit flip of these rows.
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:FFD8FFE000104A46
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
|
||||
name '/photo.jpg' 'image/png' 'photo.png'
|
||||
name '/doc.pdf' 'text/html' 'doc.pdf' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }' # no rule for css: wire wins
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }'
|
||||
|
||||
# A redirect answer resolves nothing: delayed placeholder name.
|
||||
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# MIME magic consistency (-#test=sniff <content-type> <hex:..|text>), the
|
||||
# tie-break behind htsname's wire-vs-extension naming.
|
||||
|
||||
# chk MIME BODY WANT
# Run the sniff self-test on BODY (hex:.. or literal text) claimed as MIME,
# and abort the script unless the reported verdict is exactly WANT.
chk() {
    local mime="$1" body="$2" want="$3"
    # declared apart from the assignment: `local out=$(..)` would hide a
    # failing pipeline from `set -e`
    local out

    out="$(httrack -#test=sniff "$mime" "$body" | sed -n 's/^sniff: //p')"
    if test "$out" != "$want"; then
        echo "FAIL: '$mime' '$body' -> '$out' (want '$want')"
        exit 1
    fi
}
|
||||
|
||||
yes='known=1 consistent=1'
|
||||
no='known=1 consistent=0'
|
||||
unk='known=0 consistent=0'
|
||||
|
||||
# images
|
||||
chk image/jpeg hex:FFD8FFE000104A46 "$yes"
|
||||
chk image/png hex:89504E470D0A1A0A "$yes"
|
||||
chk image/png hex:FFD8FFE000104A46 "$no" # jpeg bytes are not a png
|
||||
chk image/gif 'GIF89a' "$yes"
|
||||
chk image/bmp 'BMxxxx' "$yes"
|
||||
chk image/tiff hex:49492A00 "$yes"
|
||||
chk image/tiff hex:4D4D002A "$yes" # both endians
|
||||
chk image/x-icon hex:00000100 "$yes"
|
||||
chk image/x-icon hex:00000200 "$yes" # Windows cursor, spec maps to x-icon
|
||||
chk image/webp 'RIFFxxxxWEBPVP' "$yes"
|
||||
chk image/webp 'RIFFxxxxWAVE' "$no" # riff subtype discriminates
|
||||
chk image/avif hex:0000001C6674797061766966 "$yes"
|
||||
chk image/avif hex:0000001C6674797068656963 "$no" # heic brand is not avif
|
||||
chk image/heic hex:0000001C6674797068656963 "$yes"
|
||||
chk image/svg+xml '<svg xmlns="x">' "$yes"
|
||||
chk image/svg+xml $'\xef\xbb\xbf <?xml version="1.0"?>' "$yes" # BOM+ws skip
|
||||
|
||||
# audio / video
|
||||
chk audio/mpeg 'ID3xxx' "$yes"
|
||||
chk audio/mpeg hex:FFFB9000 "$yes" # bare frame sync
|
||||
chk audio/aac hex:FFF15080 "$yes"
|
||||
chk audio/flac 'fLaC' "$yes"
|
||||
chk audio/ogg hex:4F67675300 "$yes"
|
||||
chk audio/x-wav 'RIFFxxxxWAVE' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxAVI ' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxWAVE' "$no"
|
||||
chk video/mp4 hex:000000186674797069736F6D "$yes"
|
||||
chk video/webm hex:1A45DFA3 "$yes"
|
||||
chk video/mpeg hex:000001BA "$yes"
|
||||
chk video/x-ms-wmv hex:3026B2758E66CF11 "$yes"
|
||||
|
||||
# archives; zip magic covers the office-container families
|
||||
chk application/zip hex:504B0304 "$yes"
|
||||
chk application/vnd.openxmlformats-officedocument.wordprocessingml.document hex:504B0304 "$yes"
|
||||
chk application/vnd.oasis.opendocument.text hex:504B0304 "$yes"
|
||||
chk application/msword hex:D0CF11E0A1B11AE1 "$yes"
|
||||
chk application/msword hex:504B0304 "$no" # legacy .doc is OLE, not zip
|
||||
chk application/x-gzip hex:1F8B08 "$yes"
|
||||
chk application/x-bzip2 'BZh9' "$yes"
|
||||
chk application/x-7z-compressed hex:377ABCAF271C "$yes"
|
||||
chk application/x-rar-compressed hex:526172211A07 "$yes"
|
||||
chk application/zstd hex:28B52FFD "$yes"
|
||||
chk application/x-tar "hex:$(printf '00%.0s' {1..257})7573746172" "$yes" # ustar at 257
|
||||
chk application/x-tar hex:7573746172 "$no"
|
||||
|
||||
# documents, fonts, misc
|
||||
chk application/pdf '%PDF-1.7' "$yes"
|
||||
chk application/pdf '<html><body>soft 404</body></html>' "$no"
|
||||
chk application/postscript '%!PS-Adobe' "$yes"
|
||||
chk application/rtf '{\rtf1' "$yes"
|
||||
chk font/woff2 'wOF2' "$yes"
|
||||
chk font/otf 'OTTO' "$yes"
|
||||
chk font/ttf hex:0001000000 "$yes"
|
||||
chk application/x-shockwave-flash 'CWSx' "$yes"
|
||||
chk application/x-java-vm hex:CAFEBABE "$yes"
|
||||
chk application/wasm hex:0061736D "$yes"
|
||||
chk text/html $' \r\n<!DOCTYPE html><html>' "$yes"
|
||||
chk text/html '<html lang="en">' "$yes"
|
||||
chk text/html 'plain text, no markup' "$no"
|
||||
chk text/xml '<?xml version="1.0"?>' "$yes"
|
||||
|
||||
# no magic rule at all: never confirmed, never blocks the wire type
|
||||
chk text/css 'body { }' "$unk"
|
||||
chk text/plain 'hello' "$unk"
|
||||
chk application/x-javascript 'var x;' "$unk"
|
||||
@@ -23,11 +23,11 @@ name() {
|
||||
}
|
||||
}
|
||||
|
||||
# No live bytes: the recorded save name (X-Save) reproduces the previous
|
||||
# verdict; cached body bytes (PNG magic) are ignored; css has no magic rule.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' 'cached=image/png|www.example.com/photo.jpg'
|
||||
# Names are re-derived from the stored headers on every run: neither the
|
||||
# recorded save name nor the cached body bytes change the verdict (pinned).
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.jpg'
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
|
||||
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
|
||||
name '/style.css' 'image/png' 'style.css' 'cached=image/png|www.example.com/style.css'
|
||||
name '/style.css' 'image/png' 'style.png' 'cached=image/png|www.example.com/style.css'
|
||||
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
|
||||
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Type vs URL-extension naming (#267 family, default -%N2). A MISSING
|
||||
# type keeps a specific non-HTML ext; a DECLARED disagreeing type is trusted
|
||||
# unless magic bytes prove the ext right (lie/wrongtype/packed keep theirs),
|
||||
# so a real HTML body (report.pdf) still becomes .html. Wrong names are
|
||||
# asserted absent so a regression in either direction fails.
|
||||
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||
# asserted absent so a regression in either direction fails here.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
@@ -13,11 +14,11 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/bigtype.png' --not-found 'types/bigtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An update pass keeps the names the first crawl chose: type and save name
|
||||
# ride the cache, so a declared-text/html .pdf stays .html, a typeless .png
|
||||
# stays .png, and a sniff-kept ext is reproduced from X-Save even when the
|
||||
# refetched content changed (mutant.jpg serves PNG bytes on the rerun).
|
||||
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||
# typeless .png stays .png across the update rather than reverting.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
--found 'types/lie.html' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -E time limit (#481): server pages trickle for minutes; the engine must stop
|
||||
# on its own at -E plus grace, aborting the in-flight transfers.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# cancelled crawls can orphan .delayed placeholders (#483): skip that audit
|
||||
start=$(date +%s)
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--skip-delayed-audit \
|
||||
--log-found 'More than 2 seconds passed' \
|
||||
httrack 'BASEURL/trickle/index.html' -E2 -c4
|
||||
wall=$(($(date +%s) - start))
|
||||
# hard stop is due at -E2 + 5s grace; near TRICKLE_SECONDS means it never fired
|
||||
if [ "$wall" -ge 30 ]; then
|
||||
echo "crawl took ${wall}s, -E hard stop did not engage" >&2
|
||||
exit 1
|
||||
fi
|
||||
@@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -M byte cap (#77): the crawl must stop with the "giving up" error and keep
|
||||
# the mirror well under the 8 x 640KB the fixture totals uncapped.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--log-found 'More than 400000 bytes have been transferred.. giving up' \
|
||||
--max-mirror-bytes 4000000 \
|
||||
httrack 'BASEURL/bigfiles/index.html' -M400000 -c4
|
||||
@@ -54,7 +54,6 @@ TESTS = \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-sniff.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
@@ -96,8 +95,6 @@ TESTS = \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test \
|
||||
34_local-maxtime.test \
|
||||
35_local-maxsize.test
|
||||
33_local-delayed.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
Binary file not shown.
@@ -16,10 +16,8 @@
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# --max-mirror-bytes N \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --max-mirror-bytes asserts the mirrored content (host root) stays under N.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
@@ -94,7 +92,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
|
||||
# --- parse leading control flags --------------------------------------------
|
||||
declare -a audit=()
|
||||
declare -a cookies=()
|
||||
skip_delayed_audit=""
|
||||
scheme=http
|
||||
pos=0
|
||||
args=("$@")
|
||||
@@ -119,14 +116,11 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
cookies+=("${args[$pos]}")
|
||||
;;
|
||||
--skip-delayed-audit)
|
||||
skip_delayed_audit=1
|
||||
;;
|
||||
--errors | --files)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory | --log-found | --log-not-found | --max-mirror-bytes)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -252,15 +246,12 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107).
|
||||
# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483)
|
||||
if test -z "$skip_delayed_audit"; then
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107)
|
||||
info "checking for leftover .delayed files"
|
||||
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
|
||||
if test -z "$leftovers"; then result "OK"; else
|
||||
result "leftover: $leftovers"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
@@ -318,15 +309,6 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--max-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} <= ${audit[$i]} bytes"
|
||||
if test "$sz" -le "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too big"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
|
||||
@@ -464,41 +464,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_delayed_empty(self):
|
||||
self.send_raw(b"", "text/html") # 200 + Content-Length: 0
|
||||
|
||||
# -E time-limit (#481): pages that trickle far longer than any -E budget,
|
||||
# so only an engine-side abort can end the crawl.
|
||||
TRICKLE_SECONDS = 60
|
||||
|
||||
def route_trickle_index(self):
|
||||
self.send_html(
|
||||
"".join('\t<a href="p%d.bin">p%d</a>\n' % (i, i) for i in range(8))
|
||||
)
|
||||
|
||||
def route_trickle_page(self):
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
self.send_header("Content-Length", str(2 * self.TRICKLE_SECONDS))
|
||||
self.end_headers()
|
||||
if self.command == "HEAD":
|
||||
return
|
||||
try:
|
||||
for _ in range(self.TRICKLE_SECONDS):
|
||||
self.wfile.write(b"xy")
|
||||
self.wfile.flush()
|
||||
time.sleep(1.0)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# -M byte cap (#77): large fast files so a crawl overruns -M immediately.
|
||||
BIGFILE_BYTES = 640 * 1024
|
||||
|
||||
def route_bigfiles_index(self):
|
||||
self.send_html(
|
||||
"".join('\t<a href="p%d.bin">p%d</a>\n' % (i, i) for i in range(8))
|
||||
)
|
||||
|
||||
def route_bigfile(self):
|
||||
self.send_raw(b"x" * self.BIGFILE_BYTES, "application/octet-stream")
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -544,24 +509,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/delayed/index.html": route_delayed_index,
|
||||
"/trickle/index.html": route_trickle_index,
|
||||
"/trickle/p0.bin": route_trickle_page,
|
||||
"/trickle/p1.bin": route_trickle_page,
|
||||
"/trickle/p2.bin": route_trickle_page,
|
||||
"/trickle/p3.bin": route_trickle_page,
|
||||
"/trickle/p4.bin": route_trickle_page,
|
||||
"/trickle/p5.bin": route_trickle_page,
|
||||
"/trickle/p6.bin": route_trickle_page,
|
||||
"/trickle/p7.bin": route_trickle_page,
|
||||
"/bigfiles/index.html": route_bigfiles_index,
|
||||
"/bigfiles/p0.bin": route_bigfile,
|
||||
"/bigfiles/p1.bin": route_bigfile,
|
||||
"/bigfiles/p2.bin": route_bigfile,
|
||||
"/bigfiles/p3.bin": route_bigfile,
|
||||
"/bigfiles/p4.bin": route_bigfile,
|
||||
"/bigfiles/p5.bin": route_bigfile,
|
||||
"/bigfiles/p6.bin": route_bigfile,
|
||||
"/bigfiles/p7.bin": route_bigfile,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
|
||||
Reference in New Issue
Block a user