Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
09a2048509 htsparse: convert JS-detection automaton macros to static functions
AUTOMATE_LOOKUP_CURRENT_ADR and INCREMENT_CURRENT_ADR captured four parser
locals (inscript, inscript_state, inscript_state_pos, html) by lexical scope.
Replace them with two static helpers driven through a small script_automate
struct of pointers set up once, and lift the INSCRIPT enum to file scope so the
helpers can name it. The obscure `new_state_pos*sizeof(row) < sizeof(table)`
bound becomes the equivalent `next < INSCRIPT_NSTATES`.

Behavior-preserving: a JS-heavy crawl (document.write, quoted/escaped strings,
// and /* */ comments, onclick handlers) mirrors byte-identically against
master.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-01 21:17:22 +02:00
45 changed files with 431 additions and 1955 deletions

View File

@@ -63,16 +63,6 @@ AC_SUBST(LT_CV_OBJDIR,$lt_cv_objdir)
# Export version info
AC_SUBST(VERSION_INFO)
# Versioned plugin name for dlopen() in hts_create_opt(); soname major is
# libtool's current - age, so this tracks VERSION_INFO bumps automatically.
HTS_SONAME_MAJOR=$((${VERSION_INFO%%:*} - ${VERSION_INFO##*:}))
case "$host_os" in
darwin*) HTS_LIBHTSJAVA_NAME="libhtsjava.$HTS_SONAME_MAJOR.dylib" ;;
*) HTS_LIBHTSJAVA_NAME="libhtsjava.so.$HTS_SONAME_MAJOR" ;;
esac
AC_DEFINE_UNQUOTED([HTS_LIBHTSJAVA_NAME], ["$HTS_LIBHTSJAVA_NAME"],
[Versioned libhtsjava runtime name, derived from VERSION_INFO])
### Default CFLAGS
DEFAULT_CFLAGS="-Wall -Wformat -Wformat-security \
-Wmultichar -Wwrite-strings -Wcast-qual -Wcast-align \

View File

@@ -62,7 +62,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsname.c htsrobots.c htstools.c htswizard.c \
htsalias.c htsthread.c htsindex.c htsbauth.c \
htsmd5.c htszlib.c htswrap.c htsconcat.c \
htsmodules.c htscharset.c punycode.c htsencoding.c htssniff.c \
htsmodules.c htscharset.c punycode.c htsencoding.c \
md5.c \
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
@@ -70,7 +70,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
htshelp.h htsindex.h htslib.h htsmd5.h \
htsmodules.h htsname.h htsnet.h htssniff.h \
htsmodules.h htsname.h htsnet.h \
htsopt.h htsrobots.h htsthread.h \
htstools.h htswizard.h htswrap.h htszlib.h \
htsstrings.h htsarrays.h httrack-library.h \

View File

@@ -1359,18 +1359,6 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
}
// effacer entrée
/* Discard a cancelled mid-write .delayed placeholder (unusable across runs). */
static void back_delayed_discard(httrackp *opt, lien_back *back) {
if (back->r.out != NULL) {
fclose(back->r.out);
back->r.out = NULL;
}
back->r.is_write = 0;
if (opt != NULL)
url_savename_refname_remove(opt, back->url_adr, back->url_fil);
(void) UNLINK(back->url_sav);
}
int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
const int p) {
lien_back *const back = sback->lnk;
@@ -1378,12 +1366,6 @@ int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
assertf(p >= 0 && p < back_max);
if (p >= 0 && p < sback->count) { // on sait jamais..
/* mid-write cancel: drop a .delayed placeholder; real-named partials
survive for resume (--continue) */
if (back[p].r.is_write && IS_DELAYED_EXT(back[p].url_sav) &&
(back[p].status != STATUS_READY || back[p].r.statuscode <= 0)) {
back_delayed_discard(opt, &back[p]);
}
// Vérificateur d'intégrité
#if DEBUG_CHECKINT
_CHECKINT(&back[p], "Appel back_delete")
@@ -2255,13 +2237,12 @@ int host_wait(httrackp * opt, lien_back * back) {
static int slot_can_be_cleaned(const lien_back * back) {
return (back->status == STATUS_READY) // ready
/* Check autoclean */
&& (!back->locked) // not held by hts_wait_delayed (name pending)
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
/* Check autoclean */
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
}
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
@@ -2437,34 +2418,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
back_clean(opt, cache, sback);
#endif
/* Time limit exceeded past grace: abort in-flight transfers so no wait loop
starves (#481). FTP slots stay, their thread owns the socket. */
if (!back_checkmirror(opt)) {
int aborted = 0;
unsigned int i;
for (i = 0; i < (unsigned int) back_max; i++) {
if (back[i].status > 0 && back[i].status < STATUS_FTP_TRANSFER) {
if (back[i].r.soc != INVALID_SOCKET) {
deletehttp(&back[i].r);
}
back[i].r.soc = INVALID_SOCKET;
/* drop a .delayed placeholder; real partials survive for resume */
if (back[i].r.is_write && IS_DELAYED_EXT(back[i].url_sav))
back_delayed_discard(opt, &back[i]);
back[i].r.statuscode = STATUSCODE_TIMEOUT;
strcpybuff(back[i].r.msg, "Mirror Time Out");
back[i].status = STATUS_READY;
back_set_finished(sback, i);
aborted++;
}
}
if (aborted > 0)
hts_log_print(opt, LOG_WARNING,
"time limit reached, %d transfer(s) aborted", aborted);
return;
}
// recevoir tant qu'il y a des données (avec un maximum de max_loop boucles)
do_wait = 0;
gestion_timeout = 0;
@@ -2938,10 +2891,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
// range size hack old location
#if HTS_DIRECTDISK
// Shortcut: store the file directly on disk when possible,
// sparing memory
if (back[i].status &&
!back[i].locked) { // name still pending when locked
// Court-circuit:
// Peut-on stocker le fichier directement sur disque?
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
if (back[i].status) {
if (back[i].r.is_write == 0) { // mode mémoire
if (back[i].r.adr == NULL) { // rien n'a été écrit
if (!back[i].testmode) { // pas mode test
@@ -4007,12 +3960,8 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
&& (back[i].r.adr = (char *) malloct(2))) {
back[i].r.adr[0] = 0;
}
/* locked = name pending; the waiter finalizes after
patching url_sav (else: cached as .delayed, #5) */
if (!back[i].locked) {
hts_log_print(opt, LOG_TRACE, "finalizing empty");
back_finalize(opt, cache, sback, i);
}
hts_log_print(opt, LOG_TRACE, "finalizing empty");
back_finalize(opt, cache, sback, i);
} else if (!back[i].r.is_chunk) { // pas de chunk
//if (back[i].r.http11!=2) { // pas de chunk
back[i].is_chunk = 0;
@@ -4210,11 +4159,6 @@ int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize)
return 1;
}
/* Grace left to the smooth stop before in-flight transfers are aborted. */
static int back_maxtime_grace(const int maxtime) {
return maximum(5, minimum(30, maxtime / 10));
}
int back_checkmirror(httrackp * opt) {
// Check max size
if ((opt->maxsite > 0) && (HTS_STAT.stat_bytes >= opt->maxsite)) {
@@ -4231,19 +4175,13 @@ int back_checkmirror(httrackp * opt) {
*/
}
// Check max time
if (opt->maxtime > 0) {
const TStamp elapsed = time_local() - HTS_STAT.stat_timestart;
if (elapsed >= opt->maxtime) {
if (!opt->state.stop) { /* not yet stopped */
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
opt->maxtime);
/* cancel mirror smoothly */
hts_request_stop(opt, 0);
}
/* smooth stop starved past the grace period: stop waiting (#481) */
if (elapsed - opt->maxtime >= back_maxtime_grace(opt->maxtime))
return 0;
if ((opt->maxtime > 0)
&& ((time_local() - HTS_STAT.stat_timestart) >= opt->maxtime)) {
if (!opt->state.stop) { /* not yet stopped */
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
opt->maxtime);
/* cancel mirror smoothly */
hts_request_stop(opt, 0);
}
}
return 1; /* Ok, go on */

View File

@@ -136,8 +136,6 @@ void back_solve(httrackp * opt, lien_back * sback);
int host_wait(httrackp * opt, lien_back * sback);
#endif
int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize);
/* Enforce -M/-E quotas: requests a smooth stop when reached; returns 0 once
the -E deadline overran its grace period (callers must stop waiting). */
int back_checkmirror(httrackp * opt);
#endif

View File

@@ -596,18 +596,15 @@ htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
return cache_readex(opt, cache, adr, fil, save, location, NULL, 1);
}
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
const char *adr, const char *fil,
char *return_save) {
htsblk r = cache_readex(opt, cache, adr, fil, NULL, NULL, return_save, 0);
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
const char *adr, const char *fil) {
htsblk r = cache_read(opt, cache, adr, fil, NULL, NULL);
if (r.statuscode == -1) {
lien_back *itemback = NULL;
if (back_unserialize_ref(opt, adr, fil, &itemback) == 0) {
r = itemback->r;
if (return_save != NULL)
strlcpybuff(return_save, itemback->url_sav, HTS_URLMAXSIZE * 2);
/* cleanup */
back_clear_entry(itemback); /* delete entry content */
freet(itemback); /* delete item */

View File

@@ -66,11 +66,8 @@ htsblk cache_read(httrackp * opt, cache_back * cache, const char *adr,
const char *fil, const char *save, char *location);
htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
const char *fil, const char *save, char *location);
/* Like cache_read, but also yields entries whose transfer broke; return_save
(optional, HTS_URLMAXSIZE*2) receives the entry's recorded save name. */
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
const char *adr, const char *fil,
char *return_save);
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
const char *adr, const char *fil);
htsblk cache_readex(httrackp * opt, cache_back * cache, const char *adr,
const char *fil, const char *save, char *location,
char *return_save, int readonly);

View File

@@ -175,9 +175,7 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
//
socinput(soc, line, 1000);
if (strnotempty(line)) {
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
protocol[256] */
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
lien_adrfil af;
// méthode en majuscule

View File

@@ -3371,41 +3371,6 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
return n;
}
/* One engine-loop tick: refresh the transfer stats and run the loop callback
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr) {
engine_stats();
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
return RUN_CALLBACK7(
opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
? HTS_TRUE
: HTS_FALSE;
}
/* Single implementation of the historical WAIT_FOR_AVAILABLE_SOCKET macros. */
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
cache_back *cache, int ptr) {
const int prev = opt->state._hts_in_html_parsing;
while (back_pluggable_sockets_strict(sback, opt) <= 0) {
opt->state._hts_in_html_parsing = 6;
back_wait(sback, opt, cache, 0);
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
if (!back_checkmirror(opt))
break;
if (!hts_loop_tick(sback, opt, -1, ptr))
return HTS_FALSE;
}
opt->state._hts_in_html_parsing = prev;
return HTS_TRUE;
}
int back_pluggable_sockets(struct_back * sback, httrackp * opt) {
int n;

View File

@@ -432,15 +432,6 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
/* One engine-loop tick: refresh the transfer stats and run the loop callback
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr);
/* Wait until a test socket can be plugged, pumping transfers, stats and the
loop callback; gives up past the -E deadline. HTS_FALSE = callback abort. */
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
cache_back *cache, int ptr);
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
timestamp seed so it is stable within one gap and rerolls per launch. */
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);

View File

@@ -69,15 +69,11 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
typedef struct t_hts_callbackarg t_hts_callbackarg;
#endif
/* Marks a symbol an external wrapper module exports back to the engine.
Must override -fvisibility=hidden on ELF, or dlopen()ed plugins (htsjava)
hide their own hts_plug()/hts_unplug() entry points. */
/* Marks a symbol an external wrapper module exports back to the engine
(dllexport on Windows, nothing elsewhere). */
#ifndef EXTERNAL_FUNCTION
#ifdef _WIN32
#define EXTERNAL_FUNCTION __declspec(dllexport)
#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
#define EXTERNAL_FUNCTION __attribute__((visibility("default")))
#else
#define EXTERNAL_FUNCTION
#endif

View File

@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
}
}
}
/* reserve one byte for the trailing NUL written after the loop */
if (j + 1 >= max) {
/* copy */
if (j + 1 > max) {
/* overflow */
return -1;
}
@@ -300,11 +300,6 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
/* Was the character read successfully ? */
if (nRead == utfBufferSize) {
/* the 'continue' below skips the NUL-reserve guard: re-check */
if (utfBufferJ + utfBufferSize >= max) {
return -1;
}
/* Rollback write position to sequence start write position */
j = utfBufferJ;
@@ -319,8 +314,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
}
}
/* reserve one byte for the trailing NUL written after the loop */
if (j + 1 >= max) {
/* Check for overflow */
if (j + 1 > max) {
return -1;
}

View File

@@ -128,33 +128,6 @@ void launch_ftp(FTPDownloadStruct * params) {
return 0; \
}
/* Bounded split of a hostile-URL "user[:pass]@" prefix (see htsftp.h). */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size) {
size_t n = 0;
assertf(user_size > 0 && pass_size > 0); /* the size-1 math underflows on 0 */
while (src[n] != '\0' && src[n] != ':') {
if (n < user_size - 1)
user[n] = src[n];
n++;
}
user[n < user_size ? n : user_size - 1] = '\0';
pass[0] = '\0';
if (src[n] == ':') { // password follows the colon
const size_t base = n + 1;
size_t k = 0;
while (&src[base + k + 1] < end && src[base + k] != '\0') {
if (k < pass_size - 1)
pass[k] = src[base + k];
k++;
}
pass[k < pass_size ? k : pass_size - 1] = '\0';
}
}
// la véritable fonction une fois lancées les routines thread/fork
int run_launch_ftp(FTPDownloadStruct * pStruct) {
lien_back *back = pStruct->pBack;
@@ -200,7 +173,24 @@ int run_launch_ftp(FTPDownloadStruct * pStruct) {
while(*real_adr == '/')
real_adr++; // sauter /
if ((adr = jump_identification(real_adr)) != real_adr) { // user
ftp_split_userpass(real_adr, adr, user, sizeof(user), pass, sizeof(pass));
int i = -1;
pass[0] = '\0';
do {
i++;
user[i] = real_adr[i];
} while((real_adr[i] != ':') && (real_adr[i]));
user[i] = '\0';
if (real_adr[i] == ':') { // pass
int j = -1;
i++; // oui on saute aussi le :
do {
j++;
pass[j] = real_adr[i + j];
} while(((&real_adr[i + j + 1]) < adr) && (real_adr[i + j]));
pass[j] = '\0';
}
}
// Calculer RETR <nom>
{
@@ -994,8 +984,8 @@ int get_ftp_line(T_SOC soc, char *ptrline, size_t line_size, int timeout) {
//case 0: break; // pas encore --> erreur (on attend)!
case 1:
HTS_STAT.HTS_TOTAL_RECV += 1; // compter flux entrant
if ((b != 10) && (b != 13) && (i < (int) sizeof(data) - 1))
data[i++] = b; // truncate hostile over-long reply lines
if ((b != 10) && (b != 13))
data[i++] = b;
break;
default:
if (ptrline)

View File

@@ -70,11 +70,6 @@ int back_launch_ftp(FTPDownloadStruct * params);
int run_launch_ftp(FTPDownloadStruct * params);
int send_line(T_SOC soc, const char *data);
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
bounded, NUL-terminated user/pass buffers, truncating to fit.
Both sizes must be nonzero. */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size);
T_SOC get_datasocket(char *to_send, size_t to_send_size);
int stop_ftp(lien_back * back);
char *linejmp(char *line);

View File

@@ -63,9 +63,6 @@ Please visit our Website: http://www.httrack.com
/* This file */
#include "htsjava.h"
/* calloct/freet wrappers */
#include "htssafe.h"
static int reverse_endian(void) {
int endian = 1;
@@ -207,16 +204,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
return 0;
}
/* A constant-pool entry is >= 1 byte on disk; reject a count exceeding
the file size (hostile .class ~68 MB alloc DoS). */
if (!hts_count_fits(header.count, (LLint) fsize(file))) {
fclose(fpout);
sprintf(str->err_msg,
"Invalid constant pool count %u (file len " LLintP ")",
(unsigned) header.count, (LLint) fsize(file));
return 0;
}
tab = (RESP_STRUCT *) calloct(header.count, sizeof(RESP_STRUCT));
tab = (RESP_STRUCT *) calloc(header.count, sizeof(RESP_STRUCT));
if (!tab) {
sprintf(str->err_msg, "Unable to alloc %d bytes",
(int) sizeof(RESP_STRUCT));
@@ -242,7 +230,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
} else { // ++ une erreur est survenue!
if (strnotempty(str->err_msg) == 0)
strcpy(str->err_msg, "Internal readtable error");
freet(tab);
free(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;
@@ -300,7 +288,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
#if JAVADEBUG
printf("end\n");
#endif
freet(tab);
free(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;

View File

@@ -33,19 +33,15 @@ Please visit our Website: http://www.httrack.com
#ifndef HTSJAVA_DEFH
#define HTSJAVA_DEFH
#include <stdint.h>
#ifndef HTS_DEF_FWSTRUCT_JAVA_HEADER
#define HTS_DEF_FWSTRUCT_JAVA_HEADER
typedef struct JAVA_HEADER JAVA_HEADER;
#endif
/* 10-byte on-disk .class header image, fread() directly: fields need exact
widths (LP64's 8-byte 'unsigned long' magic never matched 0xCAFEBABE). */
struct JAVA_HEADER {
uint32_t magic;
uint16_t minor;
uint16_t major;
uint16_t count;
unsigned long int magic;
unsigned short int minor;
unsigned short int major;
unsigned short int count;
};
#ifndef HTS_DEF_FWSTRUCT_RESP_STRUCT

View File

@@ -1149,8 +1149,7 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
linput(fp, line, 1000);
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
size_t ret;
// selon que l'on a ou pas un proxy
if (retour->req.proxy.active) {
@@ -6023,11 +6022,9 @@ HTSEXT_API httrackp *hts_create_opt(void) {
"htsswf", "htsjava", "httrack-plugin", NULL
};
#else
#ifndef HTS_LIBHTSJAVA_NAME
#define HTS_LIBHTSJAVA_NAME "libhtsjava.so" /* non-autoconf fallback */
#endif
static const char *defaultModules[] = {"libhtsswf.so.1", HTS_LIBHTSJAVA_NAME,
"httrack-plugin", NULL};
static const char *defaultModules[] = {
"libhtsswf.so.1", "libhtsjava.so.2", "httrack-plugin", NULL
};
#endif
httrackp *opt = malloc(sizeof(httrackp));

View File

@@ -41,10 +41,6 @@ Please visit our Website: http://www.httrack.com
#include "htstools.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htssniff.h"
#if HTS_USEZLIB
#include "htszlib.h"
#endif
#include <ctype.h>
#define ADD_STANDARD_PATH \
@@ -74,6 +70,31 @@ static const char *hts_tbdev[] = {
""
};
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
int prev = opt->state._hts_in_html_parsing; \
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
opt->state. _hts_in_html_parsing = 6; \
/* Wait .. */ \
back_wait(sback,opt,cache,0); \
/* Transfer rate */ \
engine_stats(); \
/* Refresh various stats */ \
HTS_STAT.stat_nsocket=back_nsoc(sback); \
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
/* Check */ \
{ \
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count,-1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
return -1; \
} \
} \
} \
opt->state._hts_in_html_parsing = prev; \
} while(0)
/* Strip all // */
static void cleanDoubleSlash(char *s) {
int i, j;
@@ -117,191 +138,37 @@ static void cleanEndingSpaceOrDot(char *s) {
}
}
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
(#267 guard), a declared disagreement is CONTESTED (sniffed below). */
typedef enum wire_verdict {
WIRE_KEEPS_EXT,
WIRE_WINS,
WIRE_CONTESTED
} wire_verdict;
static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
const char *file, char *urlmime,
size_t urlmime_size) {
if (may_unknown2(opt, wiremime, file))
return WIRE_KEEPS_EXT; /* type kept verbatim (keep-list / bogus-multiple) */
urlmime[0] = '\0';
/* type implied by the URL extension, only when confidently known (flag 0) */
if (!get_httptype_sized(opt, urlmime, urlmime_size, file, 0))
return WIRE_WINS; /* URL ext implies no known type */
if (strfield2(wiremime, urlmime))
return WIRE_KEEPS_EXT; /* agreement (no .htm->.html churn) */
if (!is_hypertext_mime(opt, urlmime, file) &&
strfield2(wiremime, HTS_UNKNOWN_MIME))
return WIRE_KEEPS_EXT; /* no declared type */
return WIRE_CONTESTED;
}
/* Optional evidence for a contested wire-vs-ext verdict. */
typedef struct sniff_src {
struct_back *sback; /* live backing (looked up by adr/fil) */
const lien_back *headers; /* snapshot: r.adr, else the url_sav file */
const char *adr, *fil;
const char *prev_save; /* previous run's save name (cache X-Save) */
} sniff_src;
#if HTS_USEZLIB
/* Inflate the head of a gzip/zlib stream; 0 when undecodable. */
static size_t sniff_inflate_head(const void *in, size_t in_len, void *out,
size_t out_len) {
z_stream zs;
size_t n = 0;
int err;
memset(&zs, 0, sizeof(zs));
if (inflateInit2(&zs, 47) != Z_OK) /* 47: gzip or zlib, autodetected */
return 0;
zs.next_in = (const Bytef *) in;
zs.avail_in = (uInt) in_len;
zs.next_out = (Bytef *) out;
zs.avail_out = (uInt) out_len;
err = inflate(&zs, Z_SYNC_FLUSH);
if (err == Z_OK || err == Z_STREAM_END || err == Z_BUF_ERROR)
n = out_len - zs.avail_out;
inflateEnd(&zs);
return n;
}
#endif
static size_t sniff_read_head(const char *path, void *buf, size_t len) {
char catbuff[CATBUFF_SIZE];
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), path), "rb");
size_t n = 0;
if (fp != NULL) {
n = fread(buf, 1, len, fp);
fclose(fp);
}
return n;
}
/* Body head of one slot: memory, else its flushed on-disk file (url_sav, or
tmpfile for a compressed stream); inflated so the sniff sees the final body.
*/
static size_t sniff_slot_head(const lien_back *slot, void *buf, size_t len) {
const htsblk *const r = &slot->r;
size_t n = 0;
if (r->adr != NULL && r->size > 0) {
n = (size_t) r->size < len ? (size_t) r->size : len;
memcpy(buf, r->adr, n);
} else {
if (r->out != NULL)
fflush(r->out);
if (slot->url_sav[0] != '\0')
n = sniff_read_head(slot->url_sav, buf, len);
if (n == 0 && slot->tmpfile != NULL && slot->tmpfile[0] != '\0')
n = sniff_read_head(slot->tmpfile, buf, len);
}
if (n > 0 && r->compressed) {
#if HTS_USEZLIB
unsigned char raw[HTS_SNIFF_LEN];
if (n > sizeof(raw))
n = sizeof(raw);
memcpy(raw, buf, n);
n = sniff_inflate_head(raw, n, buf, len);
#else
n = 0;
#endif
}
return n;
}
/* Up to len leading body bytes; 0 when unavailable, and always in
non-delayed mode (its HEAD-probe first run couldn't sniff either). */
static size_t sniff_body_head(httrackp *opt, const sniff_src *src, void *buf,
size_t len) {
size_t n = 0;
if (src == NULL || opt->savename_delayed == HTS_SAVENAME_DELAYED_NONE)
return 0;
/* live backing slot: a snapshot (back_copy_static) loses r.adr/r.out */
if (src->sback != NULL && src->adr != NULL && src->fil != NULL) {
const int b = back_index(opt, src->sback, src->adr, src->fil, NULL);
if (b >= 0)
n = sniff_slot_head(&src->sback->lnk[b], buf, len);
}
if (n == 0 && src->headers != NULL)
n = sniff_slot_head(src->headers, buf, len);
return n;
}
/* Contested verdicts: magic proving the URL ext keeps it, else wire wins. */
static int wire_patches_ext(httrackp *opt, const sniff_src *src,
const char *wiremime, const char *file) {
/* Should the wire Content-Type override the URL's own extension when naming the
saved file? True when the type is patchable (may_unknown2) and either the URL
extension implies no specific type or the server declared a disagreeing one.
A URL extension mapping to a specific non-HTML type is kept only when the
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
is named .html. The sentinel rides the cache, so updates stay consistent. */
static int wire_patches_ext(httrackp *opt, const char *wiremime,
const char *file) {
char urlmime[256];
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
case WIRE_KEEPS_EXT:
if (may_unknown2(opt, wiremime, file))
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
urlmime[0] = '\0';
/* type implied by the URL extension, only when confidently known (flag 0) */
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
return 1; /* URL ext implies no known type: trust the wire type */
if (strfield2(wiremime, urlmime))
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
the server declared no type (the sentinel); an explicitly declared type,
even text/html, is trusted, so a binary-looking URL that really serves
HTML (login/error interstitial, soft-404) is named .html. */
if (!is_hypertext_mime(opt, urlmime, file) &&
strfield2(wiremime, HTS_UNKNOWN_MIME))
return 0;
case WIRE_WINS:
return 1;
case WIRE_CONTESTED:
break;
}
if (src != NULL) {
if (hts_sniff_mime_known(urlmime)) {
unsigned char head[HTS_SNIFF_LEN];
const size_t n = sniff_body_head(opt, src, head, sizeof(head));
if (n > 0)
return hts_sniff_mime_consistent(head, n, urlmime) ? 0 : 1;
}
/* no bytes: reproduce the previous run's verdict (cached X-Save name) */
if (src->prev_save != NULL && src->prev_save[0] != '\0') {
char prevmime[256];
prevmime[0] = '\0';
if (get_httptype_sized(opt, prevmime, sizeof(prevmime), src->prev_save,
0) &&
strfield2(prevmime, urlmime))
return 0;
}
}
return 1;
}
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime,
const char *file) {
char urlmime[256];
return wiremime != NULL && strnotempty(wiremime) &&
wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime)) ==
WIRE_CONTESTED &&
hts_sniff_mime_known(urlmime);
}
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
else the declared type's ext when wire_patches_ext() allows (returns 1),
else 0. ext receives the new extension or replacement filename. */
static int resolve_extension(httrackp *opt, const sniff_src *src,
const char *cdispo, const char *contenttype,
const char *fil, char *ext, size_t ext_size) {
if (strnotempty(cdispo)) {
strlcpybuff(ext, cdispo, ext_size);
return 2;
}
if (wire_patches_ext(opt, src, contenttype, fil) &&
give_mimext(ext, ext_size, contenttype))
return 1;
return 0;
}
// Build the local save name (save) from adr/fil; renames on collision
// (e.g. INDEX.HTML vs index.html).
// forme le nom du fichier à sauver (save) à partir de fil et adr
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
int url_savename(lien_adrfilsave *const afs,
lien_adrfil *const former,
const char *referer_adr, const char *referer_fil,
@@ -538,30 +405,45 @@ int url_savename(lien_adrfilsave *const afs,
// si option check_type activée
if (is_html < 0 && opt->check_type && !ext_chg) {
int ishtest = 0;
if (protocol != PROTOCOL_FILE
&& protocol != PROTOCOL_FTP
) {
// tester type avec requète HEAD si on ne connait pas le type du fichier
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
(ishtest = ishtml(opt, fil)) <
0) { // unsure whether it's html or a file
// lire dans le cache
char BIGSTK previous_save[HTS_URLMAXSIZE * 2];
htsblk r;
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
previous_save[0] = '\0';
r = cache_read_including_broken(opt, cache, adr, fil,
previous_save); // test uniquement
if (r.statuscode != -1) { // pas d'erreur de lecture cache
char s[32];
if (r.statuscode != -1) { // cache entry read OK
s[0] = '\0';
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
adr_complete, fil_complete);
if (!HTTP_IS_REDIRECT(r.statuscode)) {
const sniff_src src = {sback, NULL, adr, fil, previous_save};
ext_chg = resolve_extension(opt, &src, r.cdispo, r.contenttype,
fil, ext, sizeof(ext));
if (strnotempty(r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, r.cdispo);
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
if (give_mimext(s, sizeof(s),
r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtest == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
//
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
Lookup mimetype not only by extension,
@@ -585,13 +467,22 @@ int url_savename(lien_adrfilsave *const afs,
// fail later
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
!opt->state.stop) {
// Check if the file is ready in backing.
// Check if the file is ready in backing. We basically take the same logic as later.
// FIXME: we should cleanup and factorize this unholy mess
if (headers != NULL && headers->status >= 0 && !is_redirect) {
const sniff_src src = {sback, headers, adr, fil, NULL};
ext_chg = resolve_extension(opt, &src, headers->r.cdispo,
headers->r.contenttype,
headers->url_fil, ext, sizeof(ext));
if (strnotempty(headers->r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, headers->r.cdispo);
} else if (wire_patches_ext(opt, headers->r.contenttype,
headers->url_fil)) {
char s[16];
if (give_mimext(
s, sizeof(s),
headers->r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
}
else if (mime_type != NULL) {
ext[0] = '\0';
@@ -609,6 +500,13 @@ int url_savename(lien_adrfilsave *const afs,
if (!may_unknown2(opt, mime_type, fil)) {
ext_chg = 1;
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtml(opt, fil) == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
} else {
ext_chg = 0;
}
@@ -627,10 +525,11 @@ int url_savename(lien_adrfilsave *const afs,
int has_been_moved = 0;
lien_adrfil current;
/* Wait for an available test slot, honoring the connection limits
/* Ensure we don't use too many sockets by using a "testing" one
If we have only 1 simultaneous connection authorized, wait for pending download
Wait for an available slot
*/
if (!hts_wait_available_socket(sback, opt, cache, ptr))
return -1;
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
/* Rock'in */
current.adr[0] = current.fil[0] = '\0';
@@ -660,11 +559,24 @@ int url_savename(lien_adrfilsave *const afs,
if (ptr >= 0) {
back_fillmax(sback, opt, cache, ptr, numero_passe);
}
if (!hts_loop_tick(sback, opt, b, ptr)) {
// on est obligé d'appeler le shell pour le refresh..
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart),
&HTS_STAT)) {
return -1;
} else if (opt->state._hts_cancel ||
!back_checkmirror(
opt)) { // cancel level 2 or 1 (cancel parsing)
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
back_delete(opt, cache, sback, b); // cancel test
stop_looping = 1;
}
@@ -729,9 +641,8 @@ int url_savename(lien_adrfilsave *const afs,
"Loop with HEAD request (during prefetch) at %s%s",
current.adr, current.fil);
}
if (!hts_wait_available_socket(sback, opt,
cache, ptr))
return -1;
// Ajouter
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
if (back_add(sback, opt, cache, moved.adr, moved.fil, methode, referer_adr, referer_fil, 1) != -1) { // OK
hts_log_print(opt, LOG_DEBUG,
"(during prefetch) %s (%d) to link %s at %s%s",
@@ -785,10 +696,30 @@ int url_savename(lien_adrfilsave *const afs,
// libérer emplacement backing
}
// no error: change the type?
ext_chg = resolve_extension(
opt, NULL, back[b].r.cdispo, back[b].r.contenttype,
back[b].url_fil, ext, sizeof(ext));
{ // pas d'erreur, changer type?
char s[16];
s[0] = '\0';
if (strnotempty(back[b].r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, back[b].r.cdispo);
} else if (wire_patches_ext(opt, back[b].r.contenttype,
back[b].url_fil)) {
if (give_mimext(
s, sizeof(s),
back[b].r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtest == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
}
}
// FIN Si non déplacé, forcer type?

View File

@@ -100,8 +100,6 @@ void standard_name(char *b, size_t bsize, const char *dot_pos,
const char *nom_pos, const char *fil_complete,
int short_ver);
void url_savename_addstr(char *d, const char *s);
/* Contested wire-vs-ext verdict that a body sniff could settle (htssniff.h). */
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime, const char *file);
char *url_md5(char *digest_buffer, const char *fil_complete);
void url_savename_refname(const char *adr, const char *fil, char *filename);
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,

View File

@@ -49,7 +49,6 @@ Please visit our Website: http://www.httrack.com
#include "htsindex.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htssniff.h"
/* external modules */
#include "htsmodules.h"
@@ -109,47 +108,62 @@ Please visit our Website: http://www.httrack.com
#define HT_ADD_FOP
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
load and store lists can't drift apart. */
/* clang-format off */
#define ENGINE_MUTABLE_FIELDS(X) \
X(int, error, stre->error_) \
X(int, store_errpage, stre->store_errpage_) \
X(int, makeindex_done, stre->makeindex_done_) \
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
X(int, makeindex_links, stre->makeindex_links_) \
X(LLint, stat_fragment, stre->stat_fragment_)
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
#define ENGINE_DEFINE_CONTEXT() \
ENGINE_DEFINE_CONTEXT_BASE(); \
/* */ \
htsblk* const r HTS_UNUSED = stre->r_; \
hash_struct* const hash HTS_UNUSED = stre->hash_; \
char* const codebase HTS_UNUSED = stre->codebase; \
char* const base HTS_UNUSED = stre->base; \
/* */ \
const char * const template_header HTS_UNUSED = stre->template_header_; \
const char * const template_body HTS_UNUSED = stre->template_body_; \
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
/* */ \
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
/* */ \
/* */ \
int error = * stre->error_; \
int store_errpage = * stre->store_errpage_; \
/* */ \
int makeindex_done = *stre->makeindex_done_; \
FILE* makeindex_fp = *stre->makeindex_fp_; \
int makeindex_links = *stre->makeindex_links_; \
/* */ \
LLint stat_fragment = *stre->stat_fragment_; \
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
/* clang-format off: an edit realigns all backslashes, churning the macro. */
/* clang-format off */
/* Load-once: re-reading resets makestat_time (mutated locally, never SAVEd). */
#define ENGINE_SET_CONTEXT() \
ENGINE_SET_CONTEXT_BASE(); \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
/* */ \
error = * stre->error_; \
store_errpage = * stre->store_errpage_; \
/* */ \
makeindex_done = *stre->makeindex_done_; \
makeindex_fp = *stre->makeindex_fp_; \
makeindex_links = *stre->makeindex_links_; \
/* */ \
stat_fragment = *stre->stat_fragment_
/* clang-format on */
#define ENGINE_LOAD_CONTEXT() \
ENGINE_DEFINE_CONTEXT()
#define ENGINE_SAVE_CONTEXT() \
ENGINE_SAVE_CONTEXT_BASE(); \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
/* clang-format on */
/* */ \
* stre->error_ = error; \
* stre->store_errpage_ = store_errpage; \
/* */ \
*stre->makeindex_done_ = makeindex_done; \
*stre->makeindex_fp_ = makeindex_fp; \
*stre->makeindex_links_ = makeindex_links; \
/* */ \
*stre->stat_fragment_ = stat_fragment
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
@@ -605,14 +619,13 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
// Decode title with encoding
if (str->page_charset_ != NULL &&
*str->page_charset_ != '\0') {
char *sUtf = hts_convertStringToUTF8(
s, strlen(s), str->page_charset_);
if (str->page_charset_ != NULL
&& *str->page_charset_ != '\0') {
char *const sUtf =
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
if (sUtf != NULL) {
/* UTF-8 can expand past s[]; truncate to fit */
snprintf(s, sizeof(s), "%s", sUtf);
freet(sUtf);
strcpy(s, sUtf);
free(sUtf);
}
}
@@ -3399,7 +3412,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
back_fillmax(sback, opt, cache, ptr, numero_passe);
if (!hts_loop_tick(sback, opt, 0, ptr)) {
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, 0, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
*stre->exit_xh_ = 1; // exit requested
XH_uninit;
@@ -3410,6 +3436,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
nofollow = 1; // moins violent
opt->state._hts_cancel = 0;
}
}
// refresh the backing system each 2 seconds
if (engine_stats()) {
@@ -3476,24 +3503,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
return 0;
}
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
* contract in htsparse.h. */
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil) {
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
const int norm_query = opt->urlhack && !opt->no_query_dedup;
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
if (strcasecmp(jump_identification_const(moved_adr),
jump_identification_const(cur_adr)) != 0)
return HTS_FALSE;
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
return strcasecmp(n_fil, pn_fil) == 0;
}
/*
Check 301, 302, .. statuscodes (moved)
*/
@@ -3539,9 +3548,36 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
if ((reponse =
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// A same-file alias redirect must be followed, not stubbed (#159).
const hts_boolean same_savefile = hts_redirect_same_savefile(
opt, urladr(), urlfil(), moved->adr, moved->fil);
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
}
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
// c'est (en gros) la même URL..
// si c'est un problème de casse dans le host c'est que le serveur est buggé
@@ -3569,17 +3605,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
moved->adr, moved->fil);
}
} else if (same_savefile) {
// A stub would point at itself; follow the redirect instead.
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
&set_prio_to, NULL) != 1) {
get_it = 1;
hts_log_print(opt, LOG_WARNING,
"Redirect to a same-file alias, fetching real "
"content: %s%s -> %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
} /* sinon traité normalement */
} /* sinon traité normalement */
}
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
@@ -3602,11 +3628,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
heap(heap(ptr)->precedent)->adr,
heap(heap(ptr)->precedent)->fil, opt,
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
// Same-file alias: the reserved name is the invalidated source,
// so record anyway.
if (same_savefile ||
hash_read(hash, savedmoved.save, NULL,
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien avec SAV IDENTIQUE
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
// mode test?
@@ -3630,6 +3652,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
"moving %s to an existing file %s",
heap(ptr)->fil, urlfil());
}
}
}
@@ -3946,8 +3969,22 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
{
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
b = 0;
if (!hts_loop_tick(sback, opt, b, ptr) || !back_checkmirror(opt)) {
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|| !back_checkmirror(opt)) {
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
*stre->exit_xh_ = 1; // exit requested
XH_uninit;
@@ -4049,11 +4086,21 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) { // on fait la pause..
opt->state._hts_in_html_parsing = 6;
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
if (!back_checkmirror(opt))
break;
if (!hts_loop_tick(sback, opt, b, ptr)) {
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
*stre->exit_xh_ = 1; // exit requested
XH_uninit;
@@ -4240,12 +4287,26 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
freet(s);
}
if (!hts_loop_tick(sback, opt, b, ptr)) {
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
*stre->exit_xh_ = 1; // exit requested
XH_uninit;
return 0;
}
}
#if HTS_POLL
@@ -4478,9 +4539,10 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
IS_DELAYED_EXT(afs->save) && continue_loop && loops < 7; loops++) {
continue_loop = 0;
/* Wait for an available slot */
if (!hts_wait_available_socket(sback, opt, cache, ptr))
return -1;
/*
Wait for an available slot
*/
WAIT_FOR_AVAILABLE_SOCKET();
/* We can lookup directly in the cache to speedup this mess */
if (opt->delayed_cached) {
@@ -4626,28 +4688,39 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
if (ptr >= 0) {
back_fillmax(sback, opt, cache, ptr, numero_passe);
}
if (!hts_loop_tick(sback, opt, b, ptr)) {
back_set_unlocked(sback, b);
return -1;
} else if (opt->state._hts_cancel ||
!back_checkmirror(
opt)) { // cancel level 2 or 1 (cancel parsing)
back_delete(opt, cache, sback, b); // cancel test
break;
// on est obligé d'appeler le shell pour le refresh..
{
// Transfer rate
engine_stats();
// Refresh various stats
HTS_STAT.stat_nsocket = back_nsoc(sback);
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
if (!RUN_CALLBACK7
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
return -1;
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
back_delete(opt, cache, sback, b); // cancel test
break;
}
}
} while (
/* dns/connect/request */
(back[b].status >= 99 && back[b].status <= 101) ||
/* For redirects, wait for request to be terminated */
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0) ||
/* Same for errors */
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0) ||
/* Contested type: wait for a sniffable body head (or EOF) */
(back[b].r.statuscode == HTTP_OK && back[b].status > 0 &&
strnotempty(back[b].r.cdispo) == 0 &&
back[b].r.size < HTS_SNIFF_LEN &&
hts_ext_sniff_wanted(opt, back[b].r.contenttype,
back[b].url_fil)));
} while(
/* dns/connect/request */
(back[b].status >= 99 && back[b].status <= 101)
||
/* For redirects, wait for request to be terminated */
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0)
||
/* Same for errors */
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0)
);
if (b >= 0) {
back_set_unlocked(sback, b); // Unlocked entry
}
@@ -4782,9 +4855,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
/* Still have a back reference */
if (b >= 0) {
/* patch url_sav BEFORE finalize: it records/caches under this name
*/
strcpybuff(back[b].url_sav, afs->save);
/* Finalize now as we have the type */
if (back[b].status == STATUS_READY) {
if (!back[b].finalized) {
@@ -4792,6 +4862,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
back_finalize(opt, cache, sback, b);
}
}
/* Patch destination filename for direct-to-disk mode */
strcpybuff(back[b].url_sav, afs->save);
}
} // b >= 0

View File

@@ -116,19 +116,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
int hts_mirror_check_moved(htsmoduleStruct * str,
htsmoduleStructExtended * stre);
/*
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
same local file, so it must be followed rather than turned into a
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
stripped, www kept (www dedup is the crawl layer's job), path
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
on the dedup hash, which folds www and never collapses http<->https.
*/
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil);
/*
Process user intercations: pause, add link, delete link..
*/
@@ -175,4 +162,27 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
/* Apply changes */ \
* str->ptr_ = ptr
#define WAIT_FOR_AVAILABLE_SOCKET() do { \
int prev = opt->state._hts_in_html_parsing; \
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
opt->state._hts_in_html_parsing = 6; \
/* Wait .. */ \
back_wait(sback,opt,cache,0); \
/* Transfer rate */ \
engine_stats(); \
/* Refresh various stats */ \
HTS_STAT.stat_nsocket=back_nsoc(sback); \
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
/* Check */ \
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
return -1; \
} \
} \
opt->state._hts_in_html_parsing = prev; \
} while(0)
#endif

View File

@@ -456,13 +456,6 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
return b->buf;
}
/** True if 'count' records of >= 1 byte each fit in 'available' bytes; guards
an attacker-controlled count driving a large allocation. */
static HTS_INLINE HTS_UNUSED hts_boolean hts_count_fits(size_t count,
LLint available) {
return (available >= 0 && (LLint) count <= available) ? HTS_TRUE : HTS_FALSE;
}
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
added bounds checking. freet() also NULLs the freed pointer and tolerates
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the

View File

@@ -45,14 +45,11 @@ Please visit our Website: http://www.httrack.com
#include "htscore.h"
#include "htsdefines.h"
#include "htslib.h"
#include "htsparse.h"
#include "htscache_selftest.h"
#include "htsdns_selftest.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htsftp.h"
#include "htsmd5.h"
#include "htssniff.h"
#if HTS_USEZLIB
#include "htszlib.h"
#endif
@@ -63,10 +60,6 @@ Please visit our Website: http://www.httrack.com
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef _WIN32
#include <sys/socket.h>
#include <unistd.h>
#endif
/* very minimalistic internal tests */
static void basic_selftests(void) {
@@ -714,8 +707,7 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
}
s = strdupt(argv[0]);
enc = argc >= 2 ? argv[1] : "UTF-8";
if (s != NULL &&
hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1, enc) == 0) {
if (s != NULL && hts_unescapeEntitiesWithCharset(s, s, strlen(s), enc) == 0) {
printf("%s\n", s);
freet(s);
} else {
@@ -724,34 +716,6 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
return 0;
}
/* The unescapers must reserve one byte for the trailing NUL: a 'max'-byte
dest holding 'max' output chars pre-fix wrote dest[max] (1-byte OOB, caught
by ASan). Both unescapeEntities and unescapeUrl share the guard. */
static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
char dest[4];
(void) opt;
(void) argc;
(void) argv;
assertf(hts_unescapeEntities("abcd", dest, sizeof(dest)) == -1);
assertf(hts_unescapeUrl("abcd", dest, sizeof(dest)) == -1);
assertf(hts_unescapeEntities("abc", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "abc") == 0);
/* raw multi-byte UTF-8 flush path (bypasses the per-byte guard) */
assertf(hts_unescapeUrl("ab\xC3\xA9", dest, sizeof(dest)) == -1);
assertf(hts_unescapeUrl("a\xC3\xA9", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "a\xC3\xA9") == 0);
{
/* %xx-encoded flush path (utfBufferJ = lastJ rollback) */
char wide[8];
assertf(hts_unescapeUrl("%C3%A9", wide, sizeof(wide)) == 0);
assertf(strcmp(wide, "\xC3\xA9") == 0);
}
printf("unescape-bounds self-test OK\n");
return 0;
}
static int st_hashtable(httrackp *opt, int argc, char **argv) {
char *snum;
unsigned long count = 0;
@@ -1094,218 +1058,35 @@ static int st_resolve(httrackp *opt, int argc, char **argv) {
return 0;
}
/* Extra args are key=value: adr= cdispo= statuscode= status= strip= urlhack=
no-www= no-slash= no-query= n83= type=, plus repeatable prior=adr|fil|sav
registering an already-crawled link (dedup/collision paths). */
/* Parse raw response-header lines and print the naming-relevant fields. */
static int st_header(httrackp *opt, int argc, char **argv) {
htsblk r;
int i;
(void) opt;
if (argc < 1) {
fprintf(stderr, "header: needs at least one raw header line\n");
return 1;
}
memset(&r, 0, sizeof(r));
for (i = 0; i < argc; i++) {
char BIGSTK line[HTS_URLMAXSIZE * 2];
strcpybuff(line, argv[i]);
treathead(NULL, "www.example.com", "/", &r, line);
}
printf("contenttype=%s cdispo=%s\n", r.contenttype, r.cdispo);
return 0;
}
/* Decode a body argument ("hex:FFD8.." or literal text) into buf. */
static size_t st_decode_body(const char *arg, char *buf, size_t size) {
size_t n = 0;
if (strncmp(arg, "hex:", 4) == 0) {
const char *s = arg + 4;
for (; s[0] != '\0' && s[1] != '\0' && n + 1 < size; s += 2) {
unsigned int byte;
if (sscanf(s, "%2x", &byte) != 1)
break;
buf[n++] = (char) byte;
}
} else {
n = strlen(arg);
if (n >= size)
n = size - 1;
memcpy(buf, arg, n);
}
buf[n] = '\0';
return n;
}
static int st_sniff(httrackp *opt, int argc, char **argv) {
char BIGSTK body[1024];
size_t n;
(void) opt;
if (argc < 2) {
fprintf(stderr, "sniff: needs a content-type and a body\n");
return 1;
}
n = st_decode_body(argv[1], body, sizeof(body));
printf("sniff: known=%d consistent=%d\n",
hts_sniff_mime_known(argv[0]) == HTS_TRUE,
hts_sniff_mime_consistent(body, n, argv[0]) == HTS_TRUE);
return 0;
}
static int st_savename(httrackp *opt, int argc, char **argv) {
lien_adrfilsave afs;
cache_back cache;
struct_back *sback;
hash_struct hash;
lien_back headers;
const char *adr = "www.example.com";
const char *cdispo = NULL;
const char *body = NULL;
const char *cached = NULL;
const char *bodyfile = "st-savename-body.tmp";
int statuscode = HTTP_OK, status = 0;
int i;
if (argc < 2) {
fprintf(stderr, "savename: needs a fil and a content-type\n");
return 1;
}
/* knobs first: hash_init and the prior links depend on them */
for (i = 2; i < argc; i++) {
const char *const a = argv[i];
if (strncmp(a, "adr=", 4) == 0)
adr = a + 4;
else if (strncmp(a, "cdispo=", 7) == 0)
cdispo = a + 7;
else if (strncmp(a, "statuscode=", 11) == 0)
statuscode = atoi(a + 11);
else if (strncmp(a, "status=", 7) == 0)
status = atoi(a + 7);
else if (strncmp(a, "strip=", 6) == 0)
StringCopy(opt->strip_query, a + 6);
else if (strncmp(a, "urlhack=", 8) == 0)
opt->urlhack = atoi(a + 8) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-www=", 7) == 0)
opt->no_www_dedup = atoi(a + 7) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-slash=", 9) == 0)
opt->no_slash_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-query=", 9) == 0)
opt->no_query_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "n83=", 4) == 0)
opt->savename_83 = atoi(a + 4);
else if (strncmp(a, "type=", 5) == 0)
opt->savename_type = atoi(a + 5);
else if (strncmp(a, "body=", 5) == 0)
body = a + 5;
else if (strncmp(a, "cached=", 7) == 0)
cached = a + 7;
else if (strncmp(a, "prior=", 6) != 0) {
fprintf(stderr, "savename: unknown arg '%s'\n", a);
return 1;
}
}
memset(&afs, 0, sizeof(afs));
strcpybuff(afs.af.adr, adr);
strcpybuff(afs.af.adr, "www.example.com");
strcpybuff(afs.af.fil, argv[0]);
memset(&cache, 0, sizeof(cache));
if (cached != NULL) { /* cached=<content-type>|<save name> */
char *dup = strdupt(cached);
char *const sep = strchr(dup, '|');
char locbuf[64] = "";
htsblk cr;
if (sep == NULL) {
fprintf(stderr, "savename: cached needs ctype|save\n");
return 1;
}
*sep = '\0';
/* one-entry cache in cwd, reopened read-only; body is PNG magic on
purpose: only the recorded name (X-Save) may drive the naming */
StringCopy(opt->path_log, "");
cache.type = 1;
cache.log = cache.errlog = stderr;
cache.hashtable = coucal_new(0);
cache_init(&cache, opt);
hts_init_htsblk(&cr);
cr.statuscode = HTTP_OK;
strcpybuff(cr.msg, "OK");
strcpybuff(cr.contenttype, dup);
cr.location = locbuf;
cr.adr = strdupt("\x89PNG\r\n\x1a\n");
cr.size = 8;
cache_add(opt, &cache, &cr, adr, argv[0], sep + 1, 1, NULL);
freet(cr.adr);
if (cache.zipOutput != NULL) {
zipClose(cache.zipOutput, NULL);
cache.zipOutput = NULL;
}
memset(&cache, 0, sizeof(cache));
cache.type = 1;
cache.log = cache.errlog = stderr;
cache.hashtable = coucal_new(0);
cache.ro = 1;
cache_init(&cache, opt);
freet(dup);
} else {
cache.hashtable = (void *) coucal_new(0);
}
cache.hashtable = (void *) coucal_new(0);
sback = back_new(opt, opt->maxsoc * 32 + 1024);
/* same wiring as hts_mirror (htscore.c) */
hash_init(opt, &hash, opt->urlhack);
hash.liens = (const lien_url *const *const *) &opt->liens;
opt->hash = &hash;
hts_record_init(opt);
for (i = 2; i < argc; i++) {
if (strncmp(argv[i], "prior=", 6) == 0) {
char *dup = strdupt(argv[i] + 6);
char *const p1 = strchr(dup, '|');
char *const p2 = p1 != NULL ? strchr(p1 + 1, '|') : NULL;
if (p2 == NULL) {
fprintf(stderr, "savename: prior needs adr|fil|sav\n");
return 1;
}
*p1 = *p2 = '\0';
if (!hts_record_link(opt, dup, p1 + 1, p2 + 1, "", "", NULL))
return 1;
freet(dup);
}
}
memset(&headers, 0, sizeof(headers));
headers.status = status;
headers.r.statuscode = statuscode;
headers.status = 0;
headers.r.statuscode = HTTP_OK;
strcpybuff(headers.r.contenttype, argv[1]);
if (cdispo != NULL)
strcpybuff(headers.r.cdispo, cdispo);
strcpybuff(headers.url_fil, argv[0]);
if (body != NULL) { /* leading body bytes, read via url_sav */
char BIGSTK data[1024];
const size_t n = st_decode_body(body, data, sizeof(data));
FILE *const fp = fopen(bodyfile, "wb");
if (fp == NULL || fwrite(data, 1, n, fp) != n) {
fprintf(stderr, "savename: can not write %s\n", bodyfile);
return 1;
}
fclose(fp);
strcpybuff(headers.url_sav, bodyfile);
}
url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, &hash, 0, 0,
&headers);
if (body != NULL)
(void) UNLINK(bodyfile);
printf("savename: %s\n", afs.save);
return 0;
}
@@ -1559,37 +1340,6 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
return 0;
}
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
* alias. */
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
(void) argc;
(void) argv;
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
/* scheme and userinfo collapse (the #159 case); a different path does not */
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
*/
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
/* slash/query fold only when the dedup flag is on */
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
assertf(
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
assertf(
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
#undef SAME
printf("redirect-samefile self-test OK\n");
return 0;
}
// hts_finish_makeindex writes the footer, emits the refresh meta only when
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
static int st_makeindex(httrackp *opt, int argc, char **argv) {
@@ -1987,86 +1737,6 @@ static int st_robots(httrackp *opt, int argc, char **argv) {
return 0;
}
/* get_ftp_line must bound a hostile, CRLF-less reply into its internal
1024-byte buffer; ASan turns the pre-fix overflow into an abort here. */
#ifndef _WIN32
static int st_ftpline(httrackp *opt, int argc, char **argv) {
int sv[2];
char line[2048];
char flood[4096];
(void) opt;
(void) argc;
(void) argv;
memset(flood, 'x', sizeof(flood));
assertf(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);
assertf(write(sv[1], "220 ", 4) == 4); // valid 3-digit code
assertf(write(sv[1], flood, sizeof(flood)) == (ssize_t) sizeof(flood));
assertf(write(sv[1], "\r\n", 2) == 2); // end the line so we return
close(sv[1]);
line[0] = '\0';
get_ftp_line(sv[0], line, sizeof(line), 5);
close(sv[0]);
printf("ftp-line self-test OK (bounded %d-byte reply)\n",
(int) sizeof(flood));
return 0;
}
#endif
/* ftp_split_userpass: well-formed split, plus a hostile over-long userinfo
that pre-fix overran user[256]/pass[256]. */
static int st_ftpuser(httrackp *opt, int argc, char **argv) {
char user[256], pass[256];
char in[1200];
(void) opt;
(void) argc;
(void) argv;
{
const char ok[] = "bob:secret@host/f"; // '@' at index 10
ftp_split_userpass(ok, ok + 11, user, sizeof(user), pass, sizeof(pass));
assertf(strcmp(user, "bob") == 0);
assertf(strcmp(pass, "secret") == 0);
}
memset(in, 'u', 400);
in[400] = ':';
memset(in + 401, 'p', 400);
in[801] = '@';
in[802] = '\0';
ftp_split_userpass(in, in + 802, user, sizeof(user), pass, sizeof(pass));
assertf(strlen(user) == sizeof(user) - 1);
assertf(strlen(pass) == sizeof(pass) - 1);
{
/* tight sizes + guard byte catch an off-by-one the 256 case can't */
char ubuf[16], pbuf[16];
memset(ubuf, 'Z', sizeof(ubuf));
memset(pbuf, 'Z', sizeof(pbuf));
ftp_split_userpass(in, in + 802, ubuf, 8, pbuf, 8);
assertf(strcmp(ubuf, "uuuuuuu") == 0);
assertf(strcmp(pbuf, "ppppppp") == 0);
assertf(ubuf[8] == 'Z' && pbuf[8] == 'Z');
}
printf("ftp-userpass self-test OK\n");
return 0;
}
/* hts_count_fits caps the .class constant-pool entry count to the file size,
rejecting the ~68 MB-per-file calloc DoS. */
static int st_java(httrackp *opt, int argc, char **argv) {
(void) opt;
(void) argc;
(void) argv;
assertf(hts_count_fits(10, 1000) == HTS_TRUE);
assertf(hts_count_fits(0, 10) == HTS_TRUE);
assertf(hts_count_fits(65535, 10) == HTS_FALSE);
assertf(hts_count_fits(1, 0) == HTS_FALSE);
assertf(hts_count_fits(1, -1) == HTS_FALSE);
printf("java constant-pool cap self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -2087,8 +1757,6 @@ static const struct selftest_entry {
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
st_redirect_samefile},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},
@@ -2097,8 +1765,6 @@ static const struct selftest_entry {
{"idna-decode", "<host>", "decode an IDNA/punycode hostname",
st_idna_decode},
{"entities", "<string> [encoding]", "unescape HTML entities", st_entities},
{"unescape-bounds", "", "unescapers reserve the NUL byte (no 1-byte OOB)",
st_unescape_bounds},
{"hashtable", "<count|file>", "coucal hashtable stress test", st_hashtable},
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
st_strsafe},
@@ -2108,12 +1774,8 @@ static const struct selftest_entry {
st_relative},
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",
st_resolve},
{"header", "<raw-header-line> ...", "response header-line parsing",
st_header},
{"savename", "<fil> <content-type> [key=value ...]",
"local save-name for a URL", st_savename},
{"sniff", "<content-type> <hex:..|text>", "MIME magic consistency",
st_sniff},
{"savename", "<fil> <content-type>", "local save-name for a URL",
st_savename},
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
st_cache_golden},
@@ -2133,12 +1795,6 @@ static const struct selftest_entry {
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
st_robots},
#ifndef _WIN32
{"ftp-line", "", "get_ftp_line bounds a hostile FTP reply line",
st_ftpline},
#endif
{"ftp-userpass", "", "ftp_split_userpass bounds URL userinfo", st_ftpuser},
{"java", "", "java .class constant-pool count cap self-test", st_java},
};
static void list_selftests(void) {

View File

@@ -1,352 +0,0 @@
/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Important notes:
- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: MIME magic-byte consistency checks */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
#include "htssniff.h"
#include <string.h>
#include "htslib.h"
/* One magic rule: `len` bytes at `off` confirm `mime`. */
typedef struct sniff_magic {
const char *mime;
unsigned short off;
unsigned char len;
const char *bytes;
} sniff_magic;
/* Direction is mime -> magic (verify a claim, never classify); types with
no reliable magic (plain text, css, js..) are deliberately absent. Patterns
follow the WHATWG MIME Sniffing Standard tables where it defines them
(https://mimesniff.spec.whatwg.org/); the rest covers httrack's wider MIME
set. Spec-only types absent from our MIME tables (EOT, font/collection)
are omitted as unreachable. */
static const sniff_magic sniff_table[] = {
/* images */
{"image/jpeg", 0, 3, "\xff\xd8\xff"},
{"image/pipeg", 0, 3, "\xff\xd8\xff"},
{"image/pjpeg", 0, 3, "\xff\xd8\xff"},
{"image/png", 0, 8, "\x89PNG\r\n\x1a\n"},
{"image/gif", 0, 6, "GIF87a"},
{"image/gif", 0, 6, "GIF89a"},
{"image/bmp", 0, 2, "BM"},
{"image/tiff", 0, 4, "II*\0"},
{"image/tiff", 0, 4, "MM\0*"},
{"image/x-icon", 0, 4, "\0\0\1\0"},
{"image/x-icon", 0, 4, "\0\0\2\0"}, /* Windows cursor, per the spec */
{"image/x-portable-bitmap", 0, 2, "P1"},
{"image/x-portable-bitmap", 0, 2, "P4"},
{"image/x-portable-pixmap", 0, 2, "P3"},
{"image/x-portable-pixmap", 0, 2, "P6"},
{"image/x-xpixmap", 0, 9, "/* XPM */"},
{"image/x-xbitmap", 0, 7, "#define"},
{"image/x-rgb", 0, 2, "\x01\xda"},
{"image/x-cmu-raster", 0, 4, "\xf1\x00\x40\xbb"},
/* audio */
{"audio/mpeg", 0, 3, "ID3"},
{"audio/basic", 0, 4, ".snd"},
{"audio/mid", 0, 8, "MThd\0\0\0\6"},
{"audio/midi", 0, 8, "MThd\0\0\0\6"},
{"audio/x-pn-realaudio", 0, 4, ".ra\xfd"},
{"audio/x-pn-realaudio", 0, 4, ".RMF"},
{"audio/x-pn-realaudio-plugin", 0, 4, ".ra\xfd"},
{"audio/x-pn-realaudio-plugin", 0, 4, ".RMF"},
{"audio/flac", 0, 4, "fLaC"},
{"audio/aac", 0, 4, "ADIF"},
/* video */
{"video/mpeg", 0, 4, "\x00\x00\x01\xba"},
{"video/mpeg", 0, 4, "\x00\x00\x01\xb3"},
{"video/x-sgi-movie", 0, 4, "MOVI"},
/* archives / compression */
{"application/x-gzip", 0, 3, "\x1f\x8b\x08"},
{"multipart/x-gzip", 0, 3, "\x1f\x8b\x08"},
{"application/x-compressed", 0, 3, "\x1f\x8b\x08"},
{"application/x-compress", 0, 2, "\x1f\x9d"},
{"application/x-bzip2", 0, 3, "BZh"},
{"application/x-7z-compressed", 0, 6, "7z\xbc\xaf\x27\x1c"},
/* 6-byte prefix common to RAR4 (spec) and RAR5 */
{"application/x-rar-compressed", 0, 6, "Rar!\x1a\x07"},
{"application/zstd", 0, 4, "\x28\xb5\x2f\xfd"},
{"application/arj", 0, 2, "\x60\xea"},
{"application/x-cpio", 0, 6, "070701"},
{"application/x-cpio", 0, 6, "070707"},
{"application/x-cpio", 0, 2, "\xc7\x71"},
{"application/x-sv4cpio", 0, 6, "070701"},
{"application/x-sv4crc", 0, 6, "070702"},
{"application/x-stuffit", 0, 8, "StuffIt "},
{"application/x-stuffit", 0, 4, "SIT!"},
{"application/mac-binhex40", 0, 10, "(This file"},
/* documents */
{"application/pdf", 0, 5, "%PDF-"},
{"application/postscript", 0, 2, "%!"},
{"application/rtf", 0, 5, "{\\rtf"},
{"application/x-dvi", 0, 2, "\xf7\x02"},
{"application/x-hdf", 0, 4, "\x0e\x03\x13\x01"},
{"application/x-hdf", 0, 8, "\x89HDF\r\n\x1a\n"},
{"application/x-netcdf", 0, 4, "CDF\x01"},
{"application/x-netcdf", 0, 4, "CDF\x02"},
{"application/x-msaccess", 0, 19, "\0\1\0\0Standard Jet DB"},
/* fonts */
{"font/woff", 0, 4, "wOFF"},
{"font/woff2", 0, 4, "wOF2"},
{"font/ttf", 0, 4, "\0\1\0\0"},
{"font/ttf", 0, 4, "true"},
{"font/otf", 0, 4, "OTTO"},
/* misc */
{"application/x-shockwave-flash", 0, 3, "FWS"},
{"application/x-shockwave-flash", 0, 3, "CWS"},
{"application/x-shockwave-flash", 0, 3, "ZWS"},
{"application/futuresplash", 0, 3, "FWS"},
{"application/x-director", 0, 4, "RIFX"},
{"application/x-director", 0, 4, "XFIR"},
{"application/x-java-vm", 0, 4, "\xca\xfe\xba\xbe"},
{"application/wasm", 0, 4, "\0asm"},
{"application/x-msmetafile", 0, 4, "\xd7\xcd\xc6\x9a"},
{"application/x-msmetafile", 0, 4, "\x01\x00\x09\x00"},
{"application/x-x509-ca-cert", 0, 2, "\x30\x82"},
{"application/x-pkcs12", 0, 2, "\x30\x82"},
{"application/x-pkcs7-mime", 0, 2, "\x30\x82"},
{"application/x-pkcs7-signature", 0, 2, "\x30\x82"},
{"application/x-pkcs7-certificates", 0, 2, "\x30\x82"},
{"x-world/x-vrml", 0, 5, "#VRML"},
{"application/x-bittorrent", 0, 11, "d8:announce"},
{"drawing/x-dwf", 0, 4, "(DWF"},
{"application/acad", 0, 4, "AC10"},
{NULL, 0, 0, NULL}};
/* MIME families sharing a container magic */
static const char *const zip_mimes[] = {
"application/zip", "application/x-zip-compressed", "multipart/x-zip", NULL};
static const char *const zip_mime_prefixes[] = {
"application/vnd.openxmlformats-officedocument.",
"application/vnd.oasis.opendocument.", NULL};
static const char *const ole_mimes[] = {"application/msword",
"application/excel",
"application/vnd.ms-excel",
"application/powerpoint",
"application/vnd.ms-powerpoint",
"application/vnd.ms-project",
"application/vnd.ms-works",
"application/x-msmoney",
"application/x-mspublisher",
NULL};
static const char *const tar_mimes[] = {
"application/x-tar", "application/x-ustar", "application/x-gtar", NULL};
static const char *const ogg_mimes[] = {"application/ogg", "audio/ogg",
"video/ogg", "audio/opus", NULL};
static const char *const ebml_mimes[] = {"video/webm", "audio/webm", NULL};
/* ISO-BMFF, any 'ftyp' brand: containers overlap too much to split */
static const char *const bmff_mimes[] = {"video/mp4", "audio/mp4",
"video/quicktime", NULL};
static const char *const avif_mimes[] = {"image/avif", NULL};
static const char *const heic_mimes[] = {"image/heic", NULL};
static const char *const asf_mimes[] = {"video/x-ms-asf", "video/x-ms-wmv",
"video/x-la-asf", NULL};
static const char *const xml_mimes[] = {"application/xml", "text/xml",
"image/svg+xml", "image/svg-xml", NULL};
static const char *const svg_mimes[] = {"image/svg+xml", "image/svg-xml", NULL};
static const char *const html_mimes[] = {"text/html", NULL};
static const char *const pem_mimes[] = {
"application/x-x509-ca-cert", "application/x-pkcs7-certificates",
"application/x-pkcs7-mime", "application/x-pkcs7-signature", NULL};
static hts_boolean mime_in(const char *const *list, const char *mime) {
size_t i;
for (i = 0; list[i] != NULL; i++)
if (strfield2(list[i], mime))
return HTS_TRUE;
return HTS_FALSE;
}
static hts_boolean mime_in_prefix(const char *const *list, const char *mime) {
size_t i;
for (i = 0; list[i] != NULL; i++)
if (strfield(mime, list[i]))
return HTS_TRUE;
return HTS_FALSE;
}
static hts_boolean has_bytes(const unsigned char *d, size_t n, size_t off,
const char *bytes, size_t len) {
/* overflow-safe: untrusted n alone on one side */
return n >= off && len <= n - off && memcmp(d + off, bytes, len) == 0
? HTS_TRUE
: HTS_FALSE;
}
static unsigned char ascii_lower(unsigned char c) {
return c >= 'A' && c <= 'Z' ? (unsigned char) (c + 32) : c;
}
/* Case-insensitive text prefix after an optional UTF-8 BOM and whitespace. */
static hts_boolean has_text_prefix(const unsigned char *d, size_t n,
const char *prefix) {
const size_t len = strlen(prefix);
size_t i, k;
i = n >= 3 && memcmp(d, "\xef\xbb\xbf", 3) == 0 ? 3 : 0;
while (i < n && (d[i] == ' ' || d[i] == '\t' || d[i] == '\r' || d[i] == '\n'))
i++;
if (len > n - i) /* i <= n from the loop above */
return HTS_FALSE;
for (k = 0; k < len; k++)
if (ascii_lower(d[i + k]) != ascii_lower((unsigned char) prefix[k]))
return HTS_FALSE;
return HTS_TRUE;
}
typedef enum sniff_op {
SNIFF_QUERY_KNOWN, /* is any rule defined for this MIME? */
SNIFF_QUERY_MATCH /* do the bytes confirm this MIME? */
} sniff_op;
/* Single walk for both queries so the rule set can't drift apart. */
static hts_boolean sniff_eval(sniff_op op, const unsigned char *d, size_t n,
const char *mime) {
size_t i;
/* KNOWN short-circuits; MATCH tests the magic */
#define SNIFF_RULE(cond) \
do { \
if (op == SNIFF_QUERY_KNOWN) \
return HTS_TRUE; \
if (cond) \
return HTS_TRUE; \
} while (0)
for (i = 0; sniff_table[i].mime != NULL; i++) {
if (strfield2(sniff_table[i].mime, mime)) {
SNIFF_RULE(has_bytes(d, n, sniff_table[i].off, sniff_table[i].bytes,
sniff_table[i].len));
}
}
if (mime_in(zip_mimes, mime) || mime_in_prefix(zip_mime_prefixes, mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "PK\3\4", 4) ||
has_bytes(d, n, 0, "PK\5\6", 4));
}
if (mime_in(ole_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8));
}
if (mime_in(tar_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 257, "ustar", 5));
}
if (mime_in(ogg_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "OggS\0", 5));
}
if (mime_in(ebml_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "\x1a\x45\xdf\xa3", 4));
}
if (mime_in(bmff_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 4, "ftyp", 4));
}
if (mime_in(avif_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 4, "ftypavif", 8) ||
has_bytes(d, n, 4, "ftypavis", 8));
}
if (mime_in(heic_mimes, mime)) {
SNIFF_RULE(
has_bytes(d, n, 4, "ftyphei", 7) || has_bytes(d, n, 4, "ftyphev", 7) ||
has_bytes(d, n, 4, "ftypmif1", 8) || has_bytes(d, n, 4, "ftypmsf1", 8));
}
if (mime_in(asf_mimes, mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "\x30\x26\xb2\x75\x8e\x66\xcf\x11", 8));
}
if (strfield2("audio/x-wav", mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "WAVE", 4));
}
if (strfield2("video/x-msvideo", mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "AVI ", 4));
}
if (strfield2("image/webp", mime)) {
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) &&
has_bytes(d, n, 8, "WEBPVP", 6));
}
if (strfield2("image/x-portable-anymap", mime)) {
SNIFF_RULE(n >= 2 && d[0] == 'P' && d[1] >= '1' && d[1] <= '6');
}
if (strfield2("audio/x-aiff", mime)) {
SNIFF_RULE(
has_bytes(d, n, 0, "FORM", 4) &&
(has_bytes(d, n, 8, "AIFF", 4) || has_bytes(d, n, 8, "AIFC", 4)));
}
if (strfield2("audio/mpeg", mime)) {
/* MPEG audio frame sync (11 bits), valid layer and bitrate fields */
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xe0) == 0xe0 &&
(d[1] & 0x06) != 0);
}
if (strfield2("audio/aac", mime)) {
/* ADTS sync */
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xf6) == 0xf0);
}
if (strfield2("video/mp2t", mime)) {
SNIFF_RULE(n >= 1 && d[0] == 0x47 && (n <= 188 || d[188] == 0x47));
}
if (mime_in(xml_mimes, mime)) {
SNIFF_RULE(has_text_prefix(d, n, "<?xml"));
}
if (mime_in(svg_mimes, mime)) {
SNIFF_RULE(has_text_prefix(d, n, "<svg") ||
has_text_prefix(d, n, "<!DOCTYPE svg"));
}
if (mime_in(html_mimes, mime)) {
SNIFF_RULE(has_text_prefix(d, n, "<!DOCTYPE") ||
has_text_prefix(d, n, "<html") ||
has_text_prefix(d, n, "<head"));
}
if (mime_in(pem_mimes, mime)) {
SNIFF_RULE(has_text_prefix(d, n, "-----BEGIN"));
}
if (strfield2("audio/x-mpegurl", mime)) {
SNIFF_RULE(has_text_prefix(d, n, "#EXTM3U"));
}
if (strfield2("text/x-vcard", mime)) {
SNIFF_RULE(has_text_prefix(d, n, "BEGIN:VCARD"));
}
#undef SNIFF_RULE
return HTS_FALSE;
}
hts_boolean hts_sniff_mime_known(const char *mime) {
if (mime == NULL || *mime == '\0')
return HTS_FALSE;
return sniff_eval(SNIFF_QUERY_KNOWN, NULL, 0, mime);
}
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
const char *mime) {
if (data == NULL || size == 0 || mime == NULL || *mime == '\0')
return HTS_FALSE;
return sniff_eval(SNIFF_QUERY_MATCH, (const unsigned char *) data, size,
mime);
}

View File

@@ -1,50 +0,0 @@
/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Important notes:
- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: MIME magic-byte consistency checks */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
#ifndef HTSSNIFF_DEFH
#define HTSSNIFF_DEFH
#include <stddef.h>
#include "htsglobal.h"
/* Leading-body window read to arbitrate a wire/extension MIME conflict. */
#define HTS_SNIFF_LEN 512
/* Can a magic rule ever confirm this MIME? (whether sniffing is worth it) */
hts_boolean hts_sniff_mime_known(const char *mime);
/* TRUE when the leading body bytes are consistent with the claimed MIME;
FALSE on unknown MIME, unknown magic, or too-short data (fail-safe). */
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
const char *mime);
#endif

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# get_ftp_line bounds a hostile CRLF-less FTP reply into its 1024-byte buffer.
httrack -O /dev/null -#test=ftp-line run | grep -q "ftp-line self-test OK"

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# ftp_split_userpass bounds an over-long user:pass@ from a hostile ftp:// URL.
httrack -O /dev/null -#test=ftp-userpass run | grep -q "ftp-userpass self-test OK"

View File

@@ -1,29 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# Response header-line parsing (treathead via -#test=header <raw-line> ...).
# Isolates the wire layer from url_savename, which strips traversal on its own.
hdr() {
local want="$1"
shift
out="$(httrack -O /dev/null -#test=header "$@" | grep '^contenttype=')"
test "$out" == "$want" || {
echo "FAIL: $* -> '$out' (want '$want')"
exit 1
}
}
hdr 'contenttype=application/pdf cdispo=' 'Content-Type: application/pdf'
# filename= is honored quoted or bare.
hdr 'contenttype= cdispo=report.pdf' \
'Content-Disposition: attachment; filename="report.pdf"'
hdr 'contenttype= cdispo=report.pdf' \
'Content-Disposition: attachment; filename=report.pdf'
# Path components in the filename are dropped on the wire (RFC 2616).
hdr 'contenttype= cdispo=evil.pdf' \
'Content-Disposition: attachment; filename="../../evil.pdf"'

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# .class constant-pool count is capped to the file size (calloc DoS).
httrack -O /dev/null -#test=java run | grep -q "java constant-pool cap self-test OK"

View File

@@ -1,9 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
# followed through, not turned into a self-pointing "moved" stub. The decision
# helper is exercised by the engine self-test.
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"

View File

@@ -3,38 +3,13 @@
set -euo pipefail
# Local save-name resolution (url_savename via -#test=savename <fil> <content-type> [key=value ...]).
# name() asserts on the basename, full() on the whole path; prior= registers an
# already-crawled link whose sav is rooted under the -O path (/dev/null here).
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
# scratch dir: body= and cached= write temp files (st-savename-body.tmp, hts-cache/)
scratch=$(mktemp -d)
trap 'rm -rf "$scratch"' EXIT
cd "$scratch"
run() {
"$httrack_bin" -O /dev/null -#test=savename "$@" | sed -n 's/^savename: //p'
}
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
# Asserts on the basename of "savename: <path>".
name() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$(run "$fil" "$ctype" "$@")"
test "${out##*/}" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
exit 1
}
}
full() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$(run "$fil" "$ctype" "$@")"
test "$out" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
test "${out##*/}" == "$3" || {
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
exit 1
}
}
@@ -64,95 +39,3 @@ name '/types/data.json' 'application/json' 'data.json'
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
name '/x.JPG' 'image/jpeg' 'x.JPG'
# A Content-Disposition filename replaces the URL name outright.
name '/x.php' 'application/pdf' 'report.pdf' cdispo=report.pdf
name '/download' 'text/html' 'setup.exe' cdispo=setup.exe
# Reserved characters in a hostile Content-Disposition name are sanitized.
name '/x.php' 'application/pdf' 'set_up.exe' 'cdispo=set:up.exe'
# The md5-of-query suffix lands inside a Content-Disposition name too.
name '/x.php?id=1' 'application/pdf' 'report681a.pdf' cdispo=report.pdf
# Still-downloading path (status=-1): mime drives the ext, cdispo is ignored
# there (the deliberately unfolded 4th resolve_extension variant).
name '/x.pdf' 'text/html' 'x.html' status=-1
name '/x.html' 'text/html' 'x.html' status=-1
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
# Contested type (wire disagrees with a specific ext): magic bytes proving the
# extension right keep it, anything else trusts the wire as before.
name '/photo.jpg' 'image/png' 'photo.jpg' body=hex:FFD8FFE000104A46
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
name '/photo.jpg' 'image/png' 'photo.png'
name '/doc.pdf' 'text/html' 'doc.pdf' body=hex:255044462D312E34
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
name '/style.css' 'image/png' 'style.png' 'body=body { }' # no rule for css: wire wins
# A redirect answer resolves nothing: delayed placeholder name.
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
# Root and query-only URLs get index + the md5-of-query suffix.
name '/' 'text/html' 'index.html'
name '/?a=1' 'text/html' 'index3872.html'
# Same URL crawled before: reuse its sav verbatim (case preserved).
full '/X.PHP' 'text/html' 'www.example.com/CASE.HTML' \
'prior=www.example.com|/X.PHP|www.example.com/CASE.HTML'
# Another URL owns the name: collision suffix -2, then -3, case-insensitively.
name '/x.php' 'text/html' 'x-2.html' \
'prior=www.example.com|/other.html|/dev/null/www.example.com/x.html'
name '/x.php' 'text/html' 'x-3.html' \
'prior=www.example.com|/o1.html|/dev/null/www.example.com/x.html' \
'prior=www.example.com|/o2.html|/dev/null/www.example.com/x-2.html'
name '/INDEX.HTML' 'text/html' 'INDEX-2.HTML' \
'prior=www.example.com|/index.html|/dev/null/www.example.com/index.html'
# Same basename in another directory is NOT a collision.
name '/x.php' 'text/html' 'x.html' \
'prior=www.example.com|/sub/x.html|/dev/null/www.example.com/sub/x.html'
# 8-3 modes: DOS truncates every component to 8+3, ISO9660 level 2 to 31.
full '/directory-long/verylongfilename.html' 'text/html' \
'/dev/null/EXAMPLE/DIRECTOR/VERYLONG.HTM' n83=1
full '/directory-long/verylongfilename.html' 'text/html' \
'/dev/null/EXAMPLE_C/DIRECTORY_LONG/VERYLONGFILENAME.HTM' n83=2
name '/verylongfilename.php' 'text/html' 'VERYLO-2.HTM' n83=1 \
'prior=www.example.com|/other.html|/dev/null/EXAMPLE/VERYLONG.HTM'
# urlhack dedup (#271): // collapse and www-strip map to the prior link's sav;
# the per-feature negatives opt out and take a fresh name.
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/PRIOR.html' \
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' no-slash=1 \
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
full '/w.php' 'text/html' '/dev/null/www.example.com/W-PRIOR.html' adr=example.com \
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
full '/w.php' 'text/html' '/dev/null/example.com/w.html' adr=example.com no-www=1 \
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
# Distinct URLs must stay distinct under urlhack (no over-normalization).
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' \
'prior=www.example.com|/a/c.php|/dev/null/www.example.com/a/C-PRIOR.html'
# --strip-query (#112): stripped key dedups onto the prior sav; without the
# option the same URLs stay distinct.
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/PAGE-PRIOR.html' \
strip=sid 'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
# A kept key that differs must still block the dedup (no over-stripping).
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
strip=sid 'prior=www.example.com|/page.php?id=4|/dev/null/www.example.com/PAGE-PRIOR.html'
# Hostile fils stay rooted under the mirror: ../ (raw or %2e-encoded) drops out,
# control characters become spaces, oversized names cap at 210 chars (the cap
# can chop the extension off entirely).
full '/../../etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
full '/%2e%2e/%2e%2e/etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
full '/x.php' 'application/pdf' '/dev/null/www.example.com///evil.exe' 'cdispo=../../evil.exe'
name $'/evil\rname\t.php' 'text/html' 'evil name .html'
name "/$(printf 'a%.0s' {1..300}).php" 'text/html' "$(printf 'a%.0s' {1..210})"

View File

@@ -1,87 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# MIME magic consistency (-#test=sniff <content-type> <hex:..|text>), the
# tie-break behind htsname's wire-vs-extension naming.
chk() {
local mime="$1" body="$2" want="$3"
out="$(httrack -#test=sniff "$mime" "$body" | sed -n 's/^sniff: //p')"
test "$out" == "$want" || {
echo "FAIL: '$mime' '$body' -> '$out' (want '$want')"
exit 1
}
}
yes='known=1 consistent=1'
no='known=1 consistent=0'
unk='known=0 consistent=0'
# images
chk image/jpeg hex:FFD8FFE000104A46 "$yes"
chk image/png hex:89504E470D0A1A0A "$yes"
chk image/png hex:FFD8FFE000104A46 "$no" # jpeg bytes are not a png
chk image/gif 'GIF89a' "$yes"
chk image/bmp 'BMxxxx' "$yes"
chk image/tiff hex:49492A00 "$yes"
chk image/tiff hex:4D4D002A "$yes" # both endians
chk image/x-icon hex:00000100 "$yes"
chk image/x-icon hex:00000200 "$yes" # Windows cursor, spec maps to x-icon
chk image/webp 'RIFFxxxxWEBPVP' "$yes"
chk image/webp 'RIFFxxxxWAVE' "$no" # riff subtype discriminates
chk image/avif hex:0000001C6674797061766966 "$yes"
chk image/avif hex:0000001C6674797068656963 "$no" # heic brand is not avif
chk image/heic hex:0000001C6674797068656963 "$yes"
chk image/svg+xml '<svg xmlns="x">' "$yes"
chk image/svg+xml $'\xef\xbb\xbf <?xml version="1.0"?>' "$yes" # BOM+ws skip
# audio / video
chk audio/mpeg 'ID3xxx' "$yes"
chk audio/mpeg hex:FFFB9000 "$yes" # bare frame sync
chk audio/aac hex:FFF15080 "$yes"
chk audio/flac 'fLaC' "$yes"
chk audio/ogg hex:4F67675300 "$yes"
chk audio/x-wav 'RIFFxxxxWAVE' "$yes"
chk video/x-msvideo 'RIFFxxxxAVI ' "$yes"
chk video/x-msvideo 'RIFFxxxxWAVE' "$no"
chk video/mp4 hex:000000186674797069736F6D "$yes"
chk video/webm hex:1A45DFA3 "$yes"
chk video/mpeg hex:000001BA "$yes"
chk video/x-ms-wmv hex:3026B2758E66CF11 "$yes"
# archives; zip magic covers the office-container families
chk application/zip hex:504B0304 "$yes"
chk application/vnd.openxmlformats-officedocument.wordprocessingml.document hex:504B0304 "$yes"
chk application/vnd.oasis.opendocument.text hex:504B0304 "$yes"
chk application/msword hex:D0CF11E0A1B11AE1 "$yes"
chk application/msword hex:504B0304 "$no" # legacy .doc is OLE, not zip
chk application/x-gzip hex:1F8B08 "$yes"
chk application/x-bzip2 'BZh9' "$yes"
chk application/x-7z-compressed hex:377ABCAF271C "$yes"
chk application/x-rar-compressed hex:526172211A07 "$yes"
chk application/zstd hex:28B52FFD "$yes"
chk application/x-tar "hex:$(printf '00%.0s' {1..257})7573746172" "$yes" # ustar at 257
chk application/x-tar hex:7573746172 "$no"
# documents, fonts, misc
chk application/pdf '%PDF-1.7' "$yes"
chk application/pdf '<html><body>soft 404</body></html>' "$no"
chk application/postscript '%!PS-Adobe' "$yes"
chk application/rtf '{\rtf1' "$yes"
chk font/woff2 'wOF2' "$yes"
chk font/otf 'OTTO' "$yes"
chk font/ttf hex:0001000000 "$yes"
chk application/x-shockwave-flash 'CWSx' "$yes"
chk application/x-java-vm hex:CAFEBABE "$yes"
chk application/wasm hex:0061736D "$yes"
chk text/html $' \r\n<!DOCTYPE html><html>' "$yes"
chk text/html '<html lang="en">' "$yes"
chk text/html 'plain text, no markup' "$no"
chk text/xml '<?xml version="1.0"?>' "$yes"
# no magic rule at all: never confirmed, never blocks the wire type
chk text/css 'body { }' "$unk"
chk text/plain 'hello' "$unk"
chk application/x-javascript 'var x;' "$unk"

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
httrack -O /dev/null -#test=unescape-bounds run | grep -q "unescape-bounds self-test OK"

View File

@@ -1,33 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# Update-run naming from a real cache entry (-#test=savename cached=<ctype>|<save>).
# Named 01_zlib-*: the cache writer needs zlib, which the MSan job can't run.
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
scratch=$(mktemp -d)
trap 'rm -rf "$scratch"' EXIT
cd "$scratch"
name() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$("$httrack_bin" -O /dev/null -#test=savename "$fil" "$ctype" "$@" | sed -n 's/^savename: //p')"
test "${out##*/}" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
exit 1
}
}
# No live bytes: the recorded save name (X-Save) reproduces the previous
# verdict; cached body bytes (PNG magic) are ignored; css has no magic rule.
name '/photo.jpg' 'image/png' 'photo.jpg' 'cached=image/png|www.example.com/photo.jpg'
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
name '/style.css' 'image/png' 'style.css' 'cached=image/png|www.example.com/style.css'
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'

View File

@@ -1,10 +1,11 @@
#!/bin/bash
#
# Content-Type vs URL-extension naming (#267 family, default -%N2). A MISSING
# type keeps a specific non-HTML ext; a DECLARED disagreeing type is trusted
# unless magic bytes prove the ext right (lie/wrongtype/packed keep theirs),
# so a real HTML body (report.pdf) still becomes .html. Wrong names are
# asserted absent so a regression in either direction fails.
# Content-Type vs URL-extension naming (issue #267 family) under the default
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
# asserted absent so a regression in either direction fails here.
: "${top_srcdir:=..}"
@@ -13,11 +14,7 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'types/notype.pdf' --not-found 'types/notype.html' \
--found 'types/photo.png' \
--found 'types/doc.pdf' \
--found 'types/lie.png' --not-found 'types/lie.html' \
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
--found 'types/packed.jpg' --not-found 'types/packed.png' \
--found 'types/lie.html' --not-found 'types/lie.png' \
--found 'types/report.html' --not-found 'types/report.pdf' \
--found 'types/page.htm' --not-found 'types/page.html' \
--found 'types/script.js' \

View File

@@ -1,18 +1,15 @@
#!/bin/bash
#
# An update pass keeps the names the first crawl chose: type and save name
# ride the cache, so a declared-text/html .pdf stays .html, a typeless .png
# stays .png, and a sniff-kept ext is reproduced from X-Save even when the
# refetched content changed (mutant.jpg serves PNG bytes on the rerun).
# A second (update) pass must keep the names the first crawl chose. The stored
# Content-Type rides the cache, so the update reads back the same value -- the
# unknown/unknown sentinel for a typeless response, the declared type otherwise
# -- and names consistently: a declared-text/html .pdf stays .html and a
# typeless .png stays .png across the update rather than reverting.
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
--found 'types/report.html' --not-found 'types/report.pdf' \
--found 'types/notype.png' --not-found 'types/notype.html' \
--found 'types/lie.png' --not-found 'types/lie.html' \
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
--found 'types/packed.jpg' --not-found 'types/packed.png' \
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
--found 'types/lie.html' \
httrack 'BASEURL/types/index.html'

View File

@@ -1,13 +0,0 @@
#!/bin/bash
# Issue #279: an anchored link (target.html#sec, quoted or bare) fetches the
# target with the fragment dropped (strict server 400s on a '#' in the request)
# but keeps it in the rewritten local link so the anchor still works.
set -e
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'fraglink/target.html' \
--file-matches 'fraglink/index.html' 'href=target\.html#sec' \
--file-matches 'fraglink/index.html' 'href="target\.html#sec2"' \
httrack 'BASEURL/fraglink/index.html'

View File

@@ -1,23 +0,0 @@
#!/bin/bash
# The java plugin must load (versioned dlopen name) and parse a .class
# constant pool: a resource named only inside Foo.class gets crawled.
set -e
: "${top_srcdir:=..}"
tmproot=$(mktemp -d)
trap 'rm -rf "$tmproot"' EXIT
mkdir "$tmproot/javaclass"
cat >"$tmproot/javaclass/index.html" <<'EOF'
<html><body><a href="Foo.class">applet</a></body></html>
EOF
printf 'GIF89a' >"$tmproot/javaclass/hello.gif"
# magic/minor/major, count=2, one CONSTANT_Utf8 "hello.gif", class/superclass
printf '\xCA\xFE\xBA\xBE\x00\x00\x00\x32\x00\x02\x01\x00\x09hello.gif\x00\x00\x00\x00' \
>"$tmproot/javaclass/Foo.class"
bash "$top_srcdir/tests/local-crawl.sh" --root "$tmproot" --errors 0 \
--found 'javaclass/Foo.class' \
--found 'javaclass/hello.gif' \
httrack 'BASEURL/javaclass/index.html'

View File

@@ -1,17 +0,0 @@
#!/bin/bash
#
# Content-Disposition names the saved file: the attachment filename replaces
# the URL-derived name, and a traversal filename is reduced to its last
# component, inside the mirror.
set -euo pipefail
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'cdispo/report.pdf' \
--file-matches 'cdispo/report.pdf' '%PDF' \
--not-found 'cdispo/fetch.pdf' \
--found 'cdispo/evil.pdf' \
--not-found 'evil.pdf' \
httrack 'BASEURL/cdispo/index.html'

View File

@@ -1,20 +0,0 @@
#!/bin/bash
#
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
# no "bogus state" cache warnings, resolvable links still land correctly.
set -euo pipefail
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
--found 'delayed/real.pdf' \
--file-matches 'delayed/real.pdf' '%PDF' \
--found 'delayed/notype.bin.html' \
--found 'delayed/empty.html' \
--not-found 'delayed/noloc.html' \
--not-found 'delayed/selfloop.html' \
--not-found 'delayed/chain9.pdf' \
--log-not-found 'bogus state' \
httrack 'BASEURL/delayed/index.html'

View File

@@ -1,21 +0,0 @@
#!/bin/bash
#
# -E time limit (#481): server pages trickle for minutes; the engine must stop
# on its own at -E plus grace, aborting the in-flight transfers.
set -euo pipefail
: "${top_srcdir:=..}"
# cancelled crawls can orphan .delayed placeholders (#483): skip that audit
start=$(date +%s)
bash "$top_srcdir/tests/local-crawl.sh" \
--skip-delayed-audit \
--log-found 'More than 2 seconds passed' \
httrack 'BASEURL/trickle/index.html' -E2 -c4
wall=$(($(date +%s) - start))
# hard stop is due at -E2 + 5s grace; near TRICKLE_SECONDS means it never fired
if [ "$wall" -ge 30 ]; then
echo "crawl took ${wall}s, -E hard stop did not engage" >&2
exit 1
fi

View File

@@ -6,7 +6,6 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html \
server-root/stripquery/index.html server-root/stripquery/a.html \
server-root/fraglink/index.html server-root/fraglink/target.html \
fixtures/cache-golden/hts-cache/new.zip
TESTS_ENVIRONMENT =
@@ -35,37 +34,29 @@ TESTS = \
01_engine-entities.test \
01_engine-filelist.test \
01_engine-filter.test \
01_engine-ftp-line.test \
01_engine-ftp-userpass.test \
01_engine-hashtable.test \
01_engine-header.test \
01_engine-idna.test \
01_engine-escape-room.test \
01_engine-inplace-escape.test \
01_engine-java.test \
01_engine-makeindex.test \
01_engine-mime.test \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-redirect.test \
01_engine-relative.test \
01_engine-robots.test \
01_engine-savename.test \
01_engine-selftest-dispatch.test \
01_engine-simplify.test \
01_engine-sniff.test \
01_engine-status.test \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \
01_engine-unescape-bounds.test \
01_engine-useragent.test \
01_zlib-acceptencoding.test \
01_zlib-cache.test \
01_zlib-cache-golden.test \
01_zlib-cache-writefail.test \
01_zlib-savename-cached.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \
@@ -92,11 +83,6 @@ TESTS = \
26_local-strip-query.test \
27_local-cookies-file.test \
28_local-pause.test \
29_local-redirect-fragment.test \
30_local-fragment-link.test \
31_local-javaclass.test \
32_local-cdispo.test \
33_local-delayed.test \
34_local-maxtime.test
29_local-redirect-fragment.test
CLEANFILES = check-network_sh.cache

View File

@@ -15,11 +15,8 @@
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
# host root), to assert rewritten link/content survived the crawl.
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
# which the ephemeral port forces into the cookie domain) and passes it to
# httrack via --cookies-file, to exercise preloaded cookies.
@@ -92,7 +89,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
# --- parse leading control flags --------------------------------------------
declare -a audit=()
declare -a cookies=()
skip_delayed_audit=""
scheme=http
pos=0
args=("$@")
@@ -117,9 +113,6 @@ while test "$pos" -lt "$nargs"; do
pos=$((pos + 1))
cookies+=("${args[$pos]}")
;;
--skip-delayed-audit)
skip_delayed_audit=1
;;
--errors | --files)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
@@ -128,10 +121,6 @@ while test "$pos" -lt "$nargs"; do
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
;;
--file-matches | --file-not-matches)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
pos=$((pos + 2))
;;
httrack)
pos=$((pos + 1))
break
@@ -250,17 +239,6 @@ done
test -n "$hostroot" || die "could not find host root under $out"
debug "host root: $hostroot"
# A completed crawl must leave no .delayed temporaries (issue #107).
# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483)
if test -z "$skip_delayed_audit"; then
info "checking for leftover .delayed files"
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
if test -z "$leftovers"; then result "OK"; else
result "leftover: $leftovers"
exit 1
fi
fi
# --- audit -------------------------------------------------------------------
i=0
while test "$i" -lt "${#audit[@]}"; do
@@ -316,24 +294,6 @@ while test "$i" -lt "${#audit[@]}"; do
exit 1
else result "OK"; fi
;;
--file-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} matches ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
result "no match"
exit 1
fi
;;
--file-not-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} lacks ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
result "matched"
exit 1
else result "OK"; fi
;;
esac
i=$((i + 1))
done

View File

@@ -14,7 +14,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
"""
import argparse
import gzip
import os
import time
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
@@ -135,14 +134,12 @@ class Handler(SimpleHTTPRequestHandler):
# --- type/extension matrix (issue #267 family) -------------------------
def send_raw(self, body, content_type, extra_headers=()):
def send_raw(self, body, content_type):
"""Send a raw body with an explicit Content-Type, or none at all when
content_type is None (to observe httrack's typeless-file naming)."""
self.send_response(200)
if content_type is not None:
self.send_header("Content-Type", content_type)
for name, value in extra_headers:
self.send_header(name, value)
self.send_header("Content-Length", str(len(body)))
self.end_headers()
if self.command != "HEAD":
@@ -151,8 +148,6 @@ class Handler(SimpleHTTPRequestHandler):
# Fake-binary blobs for the image/pdf/typeless cases.
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
FAKE_JPEG = b"\xff\xd8\xff\xe0" + b"\x00" * 64
BIG_JPEG = b"\xff\xd8\xff\xe0" + bytes(range(256)) * 64 # > sniff window
# path -> (body, content_type); None sends no header, "" sends an empty
# Content-Type value (no usable type, must be treated like None).
@@ -164,8 +159,6 @@ class Handler(SimpleHTTPRequestHandler):
"/types/notype.pdf": (FAKE_PDF, None),
"/types/emptyct.png": (FAKE_PNG, ""),
"/types/lie.png": (FAKE_PNG, "text/html"),
"/types/wrongtype.jpg": (FAKE_JPEG, "image/png"),
"/types/bigtype.jpg": (BIG_JPEG, "image/png"),
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
@@ -183,10 +176,6 @@ class Handler(SimpleHTTPRequestHandler):
'\t<a href="notype.pdf">notypepdf</a>\n'
'\t<img src="emptyct.png" />\n'
'\t<img src="lie.png" />\n'
'\t<img src="wrongtype.jpg" />\n'
'\t<img src="bigtype.jpg" />\n'
'\t<img src="mutant.jpg" />\n'
'\t<img src="packed.jpg" />\n'
'\t<a href="report.pdf">report</a>\n'
'\t<a href="page.htm">htm</a>\n'
'\t<script src="script.js"></script>\n'
@@ -201,25 +190,6 @@ class Handler(SimpleHTTPRequestHandler):
body, ctype = self.TYPE_MATRIX[path]
self.send_raw(body, ctype)
# content changes between crawls: run 1 sniffs JPEG, the update pass must
# keep the run-1 name (recorded verdict) even though the body is now PNG
MUTANT_SEEN = set()
def route_types_mutant(self):
path = urlsplit(self.path).path
body = self.FAKE_PNG if path in self.MUTANT_SEEN else self.FAKE_JPEG
if self.command != "HEAD":
self.MUTANT_SEEN.add(path)
self.send_raw(body, "image/png")
# gzip on the wire: the sniff must see the decoded body, not the stream
def route_types_packed(self):
self.send_raw(
gzip.compress(self.FAKE_JPEG),
"image/png",
extra_headers=[("Content-Encoding", "gzip")],
)
# --- MIME-type exclusion abort (issue #58) -----------------------------
# A -mime:application/pdf filter must abort the transfer once the header
# arrives, not download the whole body and discard it.
@@ -384,27 +354,6 @@ class Handler(SimpleHTTPRequestHandler):
if self.command != "HEAD":
self.wfile.write(body)
# Content-Disposition naming: the attachment filename replaces the
# URL-derived name; path components in it are stripped (RFC 2616).
CDISPO_NAMES = {
"/cdispo/fetch.php": "report.pdf",
"/cdispo/evil.php": "../../evil.pdf",
}
def route_cdispo_index(self):
self.send_html(
'\t<a href="fetch.php">report</a>\n' '\t<a href="evil.php">evil</a>\n'
)
def route_cdispo(self):
filename = self.CDISPO_NAMES[urlsplit(self.path).path]
cdispo = 'attachment; filename="%s"' % filename
self.send_raw(
self.FAKE_PDF,
"application/pdf",
extra_headers=[("Content-Disposition", cdispo)],
)
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
# that must be dropped before the target is fetched. A leaked '#' reaches the
# strict-server guard below and 400s.
@@ -420,74 +369,6 @@ class Handler(SimpleHTTPRequestHandler):
def route_redir_target(self):
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
# --- delayed-type degenerate paths (issues #5/#107) --------------------
def route_delayed_index(self):
self.send_html(
'\t<a href="noloc.php">noloc</a>\n'
'\t<a href="selfloop.php">selfloop</a>\n'
'\t<a href="chain1.php">chain</a>\n'
'\t<a href="redir.php">redir</a>\n'
'\t<a href="notype.bin">notype</a>\n'
'\t<a href="empty.php">empty</a>\n'
)
def send_redirect(self, location):
self.send_response(302, "Found")
if location is not None:
self.send_header("Location", location)
self.send_header("Content-Length", "0")
self.end_headers()
def route_delayed_noloc(self):
self.send_redirect(None) # 302 without Location: name never resolves
def route_delayed_selfloop(self):
self.send_redirect("selfloop.php")
def route_delayed_chain(self):
# chain1..chain9: one more hop than the type-check redirect budget
n = int(urlsplit(self.path).path.rsplit("chain", 1)[1].split(".")[0])
if n < 9:
self.send_redirect("chain%d.php" % (n + 1))
else:
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_redir(self):
self.send_redirect("real.pdf")
def route_delayed_realpdf(self):
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_notype(self):
self.send_raw(self.FAKE_PDF, None)
def route_delayed_empty(self):
self.send_raw(b"", "text/html") # 200 + Content-Length: 0
# -E time-limit (#481): pages that trickle far longer than any -E budget,
# so only an engine-side abort can end the crawl.
TRICKLE_SECONDS = 60
def route_trickle_index(self):
self.send_html(
"".join('\t<a href="p%d.bin">p%d</a>\n' % (i, i) for i in range(8))
)
def route_trickle_page(self):
self.send_response(200)
self.send_header("Content-Type", "application/octet-stream")
self.send_header("Content-Length", str(2 * self.TRICKLE_SECONDS))
self.end_headers()
if self.command == "HEAD":
return
try:
for _ in range(self.TRICKLE_SECONDS):
self.wfile.write(b"xy")
self.wfile.flush()
time.sleep(1.0)
except OSError:
pass
ROUTES = {
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
@@ -503,10 +384,6 @@ class Handler(SimpleHTTPRequestHandler):
"/types/notype.pdf": route_types,
"/types/emptyct.png": route_types,
"/types/lie.png": route_types,
"/types/wrongtype.jpg": route_types,
"/types/bigtype.jpg": route_types,
"/types/mutant.jpg": route_types_mutant,
"/types/packed.jpg": route_types_packed,
"/types/report.pdf": route_types,
"/types/page.htm": route_types,
"/types/script.js": route_types,
@@ -529,34 +406,6 @@ class Handler(SimpleHTTPRequestHandler):
"/mimex/index.html": route_mimex_index,
"/mimex/blob.pdf": route_mimex_blob,
"/mimex/real.html": route_mimex_real,
"/cdispo/index.html": route_cdispo_index,
"/cdispo/fetch.php": route_cdispo,
"/cdispo/evil.php": route_cdispo,
"/delayed/index.html": route_delayed_index,
"/trickle/index.html": route_trickle_index,
"/trickle/p0.bin": route_trickle_page,
"/trickle/p1.bin": route_trickle_page,
"/trickle/p2.bin": route_trickle_page,
"/trickle/p3.bin": route_trickle_page,
"/trickle/p4.bin": route_trickle_page,
"/trickle/p5.bin": route_trickle_page,
"/trickle/p6.bin": route_trickle_page,
"/trickle/p7.bin": route_trickle_page,
"/delayed/noloc.php": route_delayed_noloc,
"/delayed/selfloop.php": route_delayed_selfloop,
"/delayed/redir.php": route_delayed_redir,
"/delayed/real.pdf": route_delayed_realpdf,
"/delayed/notype.bin": route_delayed_notype,
"/delayed/empty.php": route_delayed_empty,
"/delayed/chain1.php": route_delayed_chain,
"/delayed/chain2.php": route_delayed_chain,
"/delayed/chain3.php": route_delayed_chain,
"/delayed/chain4.php": route_delayed_chain,
"/delayed/chain5.php": route_delayed_chain,
"/delayed/chain6.php": route_delayed_chain,
"/delayed/chain7.php": route_delayed_chain,
"/delayed/chain8.php": route_delayed_chain,
"/delayed/chain9.php": route_delayed_chain,
"/redir/index.html": route_redir_index,
"/redir/go.php": route_redir_go,
"/redir/target.html": route_redir_target,

View File

@@ -1,4 +0,0 @@
<html><body>
<a href=target.html#sec>unquoted fragment link</a>
<a href="target.html#sec2">quoted fragment link</a>
</body></html>

View File

@@ -1 +0,0 @@
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>