Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
b0d131f084 Harden three parser buffer copies against overflow
Width-less sscanf("%s %s %s") parsed a request line into fixed method/url/
protocol buffers in the catch-URL proxy (htscatchurl.c) and the postfile
sender (htslib.c), so a long token could overflow the smallest one
(method[32] / [256]). Add field widths matching each buffer.

The makeindex title decoder copied the UTF-8-expanded title into s[] with a
raw strcpy; a charset that expands past the buffer would overflow it.
Truncate with snprintf and free through freet.

The entity/URL unescapers guarded the copy with `j + 1 > max`, which reserves
nothing for the trailing NUL: an input filling dest to exactly `max` chars
wrote dest[max], one byte past a max-sized buffer. Every production caller
passes max = buffer capacity (strlen+1 or sizeof), so this was a real 1-byte
OOB, latent only because output is usually shorter than the buffer. Use
`>=`. The st_entities self-test passed max = strlen (relying on strdup's
extra byte); correct it to strlen+1 to match the callers.

Self-test unescape-bounds exercises the exact-fill boundary on both
unescapers; it aborts under ASan on the pre-fix guard.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 20:46:30 +02:00
27 changed files with 154 additions and 861 deletions

View File

@@ -63,16 +63,6 @@ AC_SUBST(LT_CV_OBJDIR,$lt_cv_objdir)
# Export version info
AC_SUBST(VERSION_INFO)
# Versioned plugin name for dlopen() in hts_create_opt(); soname major is
# libtool's current - age, so this tracks VERSION_INFO bumps automatically.
HTS_SONAME_MAJOR=$((${VERSION_INFO%%:*} - ${VERSION_INFO##*:}))
case "$host_os" in
darwin*) HTS_LIBHTSJAVA_NAME="libhtsjava.$HTS_SONAME_MAJOR.dylib" ;;
*) HTS_LIBHTSJAVA_NAME="libhtsjava.so.$HTS_SONAME_MAJOR" ;;
esac
AC_DEFINE_UNQUOTED([HTS_LIBHTSJAVA_NAME], ["$HTS_LIBHTSJAVA_NAME"],
[Versioned libhtsjava runtime name, derived from VERSION_INFO])
### Default CFLAGS
DEFAULT_CFLAGS="-Wall -Wformat -Wformat-security \
-Wmultichar -Wwrite-strings -Wcast-qual -Wcast-align \

View File

@@ -2237,13 +2237,12 @@ int host_wait(httrackp * opt, lien_back * back) {
static int slot_can_be_cleaned(const lien_back * back) {
return (back->status == STATUS_READY) // ready
/* Check autoclean */
&& (!back->locked) // not held by hts_wait_delayed (name pending)
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
/* Check autoclean */
&& (!back->testmode) // not test mode
&& (strnotempty(back->url_sav)) // filename exists
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
&& (back->r.size >= 0) // size>=0
;
}
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
@@ -2892,10 +2891,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
// range size hack old location
#if HTS_DIRECTDISK
// Shortcut: store the file directly on disk when possible,
// sparing memory
if (back[i].status &&
!back[i].locked) { // name still pending when locked
// Court-circuit:
// Peut-on stocker le fichier directement sur disque?
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
if (back[i].status) {
if (back[i].r.is_write == 0) { // mode mémoire
if (back[i].r.adr == NULL) { // rien n'a été écrit
if (!back[i].testmode) { // pas mode test
@@ -3961,12 +3960,8 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
&& (back[i].r.adr = (char *) malloct(2))) {
back[i].r.adr[0] = 0;
}
/* locked = name pending; the waiter finalizes after
patching url_sav (else: cached as .delayed, #5) */
if (!back[i].locked) {
hts_log_print(opt, LOG_TRACE, "finalizing empty");
back_finalize(opt, cache, sback, i);
}
hts_log_print(opt, LOG_TRACE, "finalizing empty");
back_finalize(opt, cache, sback, i);
} else if (!back[i].r.is_chunk) { // pas de chunk
//if (back[i].r.http11!=2) { // pas de chunk
back[i].is_chunk = 0;

View File

@@ -69,15 +69,11 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
typedef struct t_hts_callbackarg t_hts_callbackarg;
#endif
/* Marks a symbol an external wrapper module exports back to the engine.
Must override -fvisibility=hidden on ELF, or dlopen()ed plugins (htsjava)
hide their own hts_plug()/hts_unplug() entry points. */
/* Marks a symbol an external wrapper module exports back to the engine
(dllexport on Windows, nothing elsewhere). */
#ifndef EXTERNAL_FUNCTION
#ifdef _WIN32
#define EXTERNAL_FUNCTION __declspec(dllexport)
#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
#define EXTERNAL_FUNCTION __attribute__((visibility("default")))
#else
#define EXTERNAL_FUNCTION
#endif

View File

@@ -300,11 +300,6 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
/* Was the character read successfully ? */
if (nRead == utfBufferSize) {
/* the 'continue' below skips the NUL-reserve guard: re-check */
if (utfBufferJ + utfBufferSize >= max) {
return -1;
}
/* Rollback write position to sequence start write position */
j = utfBufferJ;

View File

@@ -128,33 +128,6 @@ void launch_ftp(FTPDownloadStruct * params) {
return 0; \
}
/* Bounded split of a hostile-URL "user[:pass]@" prefix (see htsftp.h). */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size) {
size_t n = 0;
assertf(user_size > 0 && pass_size > 0); /* the size-1 math underflows on 0 */
while (src[n] != '\0' && src[n] != ':') {
if (n < user_size - 1)
user[n] = src[n];
n++;
}
user[n < user_size ? n : user_size - 1] = '\0';
pass[0] = '\0';
if (src[n] == ':') { // password follows the colon
const size_t base = n + 1;
size_t k = 0;
while (&src[base + k + 1] < end && src[base + k] != '\0') {
if (k < pass_size - 1)
pass[k] = src[base + k];
k++;
}
pass[k < pass_size ? k : pass_size - 1] = '\0';
}
}
// la véritable fonction une fois lancées les routines thread/fork
int run_launch_ftp(FTPDownloadStruct * pStruct) {
lien_back *back = pStruct->pBack;
@@ -200,7 +173,24 @@ int run_launch_ftp(FTPDownloadStruct * pStruct) {
while(*real_adr == '/')
real_adr++; // sauter /
if ((adr = jump_identification(real_adr)) != real_adr) { // user
ftp_split_userpass(real_adr, adr, user, sizeof(user), pass, sizeof(pass));
int i = -1;
pass[0] = '\0';
do {
i++;
user[i] = real_adr[i];
} while((real_adr[i] != ':') && (real_adr[i]));
user[i] = '\0';
if (real_adr[i] == ':') { // pass
int j = -1;
i++; // oui on saute aussi le :
do {
j++;
pass[j] = real_adr[i + j];
} while(((&real_adr[i + j + 1]) < adr) && (real_adr[i + j]));
pass[j] = '\0';
}
}
// Calculer RETR <nom>
{
@@ -994,8 +984,8 @@ int get_ftp_line(T_SOC soc, char *ptrline, size_t line_size, int timeout) {
//case 0: break; // pas encore --> erreur (on attend)!
case 1:
HTS_STAT.HTS_TOTAL_RECV += 1; // compter flux entrant
if ((b != 10) && (b != 13) && (i < (int) sizeof(data) - 1))
data[i++] = b; // truncate hostile over-long reply lines
if ((b != 10) && (b != 13))
data[i++] = b;
break;
default:
if (ptrline)

View File

@@ -70,11 +70,6 @@ int back_launch_ftp(FTPDownloadStruct * params);
int run_launch_ftp(FTPDownloadStruct * params);
int send_line(T_SOC soc, const char *data);
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
bounded, NUL-terminated user/pass buffers, truncating to fit.
Both sizes must be nonzero. */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size);
T_SOC get_datasocket(char *to_send, size_t to_send_size);
int stop_ftp(lien_back * back);
char *linejmp(char *line);

View File

@@ -63,9 +63,6 @@ Please visit our Website: http://www.httrack.com
/* This file */
#include "htsjava.h"
/* calloct/freet wrappers */
#include "htssafe.h"
static int reverse_endian(void) {
int endian = 1;
@@ -207,16 +204,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
return 0;
}
/* A constant-pool entry is >= 1 byte on disk; reject a count exceeding
the file size (hostile .class ~68 MB alloc DoS). */
if (!hts_count_fits(header.count, (LLint) fsize(file))) {
fclose(fpout);
sprintf(str->err_msg,
"Invalid constant pool count %u (file len " LLintP ")",
(unsigned) header.count, (LLint) fsize(file));
return 0;
}
tab = (RESP_STRUCT *) calloct(header.count, sizeof(RESP_STRUCT));
tab = (RESP_STRUCT *) calloc(header.count, sizeof(RESP_STRUCT));
if (!tab) {
sprintf(str->err_msg, "Unable to alloc %d bytes",
(int) sizeof(RESP_STRUCT));
@@ -242,7 +230,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
} else { // ++ une erreur est survenue!
if (strnotempty(str->err_msg) == 0)
strcpy(str->err_msg, "Internal readtable error");
freet(tab);
free(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;
@@ -300,7 +288,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
#if JAVADEBUG
printf("end\n");
#endif
freet(tab);
free(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;

View File

@@ -33,19 +33,15 @@ Please visit our Website: http://www.httrack.com
#ifndef HTSJAVA_DEFH
#define HTSJAVA_DEFH
#include <stdint.h>
#ifndef HTS_DEF_FWSTRUCT_JAVA_HEADER
#define HTS_DEF_FWSTRUCT_JAVA_HEADER
typedef struct JAVA_HEADER JAVA_HEADER;
#endif
/* 10-byte on-disk .class header image, fread() directly: fields need exact
widths (LP64's 8-byte 'unsigned long' magic never matched 0xCAFEBABE). */
struct JAVA_HEADER {
uint32_t magic;
uint16_t minor;
uint16_t major;
uint16_t count;
unsigned long int magic;
unsigned short int minor;
unsigned short int major;
unsigned short int count;
};
#ifndef HTS_DEF_FWSTRUCT_RESP_STRUCT

View File

@@ -6023,11 +6023,9 @@ HTSEXT_API httrackp *hts_create_opt(void) {
"htsswf", "htsjava", "httrack-plugin", NULL
};
#else
#ifndef HTS_LIBHTSJAVA_NAME
#define HTS_LIBHTSJAVA_NAME "libhtsjava.so" /* non-autoconf fallback */
#endif
static const char *defaultModules[] = {"libhtsswf.so.1", HTS_LIBHTSJAVA_NAME,
"httrack-plugin", NULL};
static const char *defaultModules[] = {
"libhtsswf.so.1", "libhtsjava.so.2", "httrack-plugin", NULL
};
#endif
httrackp *opt = malloc(sizeof(httrackp));

View File

@@ -138,66 +138,37 @@ static void cleanEndingSpaceOrDot(char *s) {
}
}
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
(#267 guard), a declared disagreement is CONTESTED. Sentinel and verdict
ride the cache, so updates stay consistent. */
typedef enum wire_verdict {
WIRE_KEEPS_EXT,
WIRE_WINS,
WIRE_CONTESTED
} wire_verdict;
static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
const char *file, char *urlmime,
size_t urlmime_size) {
if (may_unknown2(opt, wiremime, file))
return WIRE_KEEPS_EXT; /* type kept verbatim (keep-list / bogus-multiple) */
urlmime[0] = '\0';
/* type implied by the URL extension, only when confidently known (flag 0) */
if (!get_httptype_sized(opt, urlmime, urlmime_size, file, 0))
return WIRE_WINS; /* URL ext implies no known type */
if (strfield2(wiremime, urlmime))
return WIRE_KEEPS_EXT; /* agreement (no .htm->.html churn) */
if (!is_hypertext_mime(opt, urlmime, file) &&
strfield2(wiremime, HTS_UNKNOWN_MIME))
return WIRE_KEEPS_EXT; /* no declared type */
return WIRE_CONTESTED;
}
/* Should the wire Content-Type override the URL's own extension when naming the
saved file? True when the type is patchable (may_unknown2) and either the URL
extension implies no specific type or the server declared a disagreeing one.
A URL extension mapping to a specific non-HTML type is kept only when the
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
is named .html. The sentinel rides the cache, so updates stay consistent. */
static int wire_patches_ext(httrackp *opt, const char *wiremime,
const char *file) {
char urlmime[256];
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
case WIRE_KEEPS_EXT:
if (may_unknown2(opt, wiremime, file))
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
urlmime[0] = '\0';
/* type implied by the URL extension, only when confidently known (flag 0) */
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
return 1; /* URL ext implies no known type: trust the wire type */
if (strfield2(wiremime, urlmime))
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
the server declared no type (the sentinel); an explicitly declared type,
even text/html, is trusted, so a binary-looking URL that really serves
HTML (login/error interstitial, soft-404) is named .html. */
if (!is_hypertext_mime(opt, urlmime, file) &&
strfield2(wiremime, HTS_UNKNOWN_MIME))
return 0;
case WIRE_WINS:
return 1;
case WIRE_CONTESTED:
break; /* no content evidence is consulted today: trust the wire */
}
return 1;
}
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
else the declared type's ext when wire_patches_ext() allows (returns 1),
else 0. ext receives the new extension or replacement filename. */
static int resolve_extension(httrackp *opt, const char *cdispo,
const char *contenttype, const char *fil,
char *ext, size_t ext_size) {
if (strnotempty(cdispo)) {
strlcpybuff(ext, cdispo, ext_size);
return 2;
}
if (wire_patches_ext(opt, contenttype, fil) &&
give_mimext(ext, ext_size, contenttype))
return 1;
return 0;
}
// Build the local save name (save) from adr/fil; renames on collision
// (e.g. INDEX.HTML vs index.html).
// forme le nom du fichier à sauver (save) à partir de fil et adr
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
int url_savename(lien_adrfilsave *const afs,
lien_adrfil *const former,
const char *referer_adr, const char *referer_fil,
@@ -434,23 +405,45 @@ int url_savename(lien_adrfilsave *const afs,
// si option check_type activée
if (is_html < 0 && opt->check_type && !ext_chg) {
int ishtest = 0;
if (protocol != PROTOCOL_FILE
&& protocol != PROTOCOL_FTP
) {
// tester type avec requète HEAD si on ne connait pas le type du fichier
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
(ishtest = ishtml(opt, fil)) <
0) { // unsure whether it's html or a file
// lire dans le cache
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
if (r.statuscode != -1) { // cache entry read OK
if (r.statuscode != -1) { // pas d'erreur de lecture cache
char s[32];
s[0] = '\0';
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
adr_complete, fil_complete);
if (!HTTP_IS_REDIRECT(r.statuscode)) {
ext_chg = resolve_extension(opt, r.cdispo, r.contenttype, fil,
ext, sizeof(ext));
if (strnotempty(r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, r.cdispo);
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
if (give_mimext(s, sizeof(s),
r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtest == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
//
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
Lookup mimetype not only by extension,
@@ -474,11 +467,22 @@ int url_savename(lien_adrfilsave *const afs,
// fail later
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
!opt->state.stop) {
// Check if the file is ready in backing.
// Check if the file is ready in backing. We basically take the same logic as later.
// FIXME: we should cleanup and factorize this unholy mess
if (headers != NULL && headers->status >= 0 && !is_redirect) {
ext_chg = resolve_extension(opt, headers->r.cdispo,
headers->r.contenttype,
headers->url_fil, ext, sizeof(ext));
if (strnotempty(headers->r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, headers->r.cdispo);
} else if (wire_patches_ext(opt, headers->r.contenttype,
headers->url_fil)) {
char s[16];
if (give_mimext(
s, sizeof(s),
headers->r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
}
else if (mime_type != NULL) {
ext[0] = '\0';
@@ -496,6 +500,13 @@ int url_savename(lien_adrfilsave *const afs,
if (!may_unknown2(opt, mime_type, fil)) {
ext_chg = 1;
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtml(opt, fil) == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
} else {
ext_chg = 0;
}
@@ -685,10 +696,30 @@ int url_savename(lien_adrfilsave *const afs,
// libérer emplacement backing
}
// no error: change the type?
ext_chg = resolve_extension(
opt, back[b].r.cdispo, back[b].r.contenttype,
back[b].url_fil, ext, sizeof(ext));
{ // pas d'erreur, changer type?
char s[16];
s[0] = '\0';
if (strnotempty(back[b].r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, back[b].r.cdispo);
} else if (wire_patches_ext(opt, back[b].r.contenttype,
back[b].url_fil)) {
if (give_mimext(
s, sizeof(s),
back[b].r.contenttype)) { // recognized extension
ext_chg = 1;
strcpybuff(ext, s);
}
}
#ifdef DEFAULT_BIN_EXT
// no extension and potentially bogus
else if (ishtest == -2) {
ext_chg = 1;
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
}
#endif
}
}
// FIN Si non déplacé, forcer type?

View File

@@ -4845,9 +4845,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
/* Still have a back reference */
if (b >= 0) {
/* Patch destination filename for direct-to-disk mode, BEFORE any
finalize: it records and caches the entry under url_sav */
strcpybuff(back[b].url_sav, afs->save);
/* Finalize now as we have the type */
if (back[b].status == STATUS_READY) {
if (!back[b].finalized) {
@@ -4855,6 +4852,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
back_finalize(opt, cache, sback, b);
}
}
/* Patch destination filename for direct-to-disk mode */
strcpybuff(back[b].url_sav, afs->save);
}
} // b >= 0

View File

@@ -456,13 +456,6 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
return b->buf;
}
/** True if 'count' records of >= 1 byte each fit in 'available' bytes; guards
an attacker-controlled count driving a large allocation. */
static HTS_INLINE HTS_UNUSED hts_boolean hts_count_fits(size_t count,
LLint available) {
return (available >= 0 && (LLint) count <= available) ? HTS_TRUE : HTS_FALSE;
}
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
added bounds checking. freet() also NULLs the freed pointer and tolerates
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the

View File

@@ -50,7 +50,6 @@ Please visit our Website: http://www.httrack.com
#include "htsdns_selftest.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htsftp.h"
#include "htsmd5.h"
#if HTS_USEZLIB
#include "htszlib.h"
@@ -62,10 +61,6 @@ Please visit our Website: http://www.httrack.com
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef _WIN32
#include <sys/socket.h>
#include <unistd.h>
#endif
/* very minimalistic internal tests */
static void basic_selftests(void) {
@@ -736,17 +731,6 @@ static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
assertf(hts_unescapeUrl("abcd", dest, sizeof(dest)) == -1);
assertf(hts_unescapeEntities("abc", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "abc") == 0);
/* raw multi-byte UTF-8 flush path (bypasses the per-byte guard) */
assertf(hts_unescapeUrl("ab\xC3\xA9", dest, sizeof(dest)) == -1);
assertf(hts_unescapeUrl("a\xC3\xA9", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "a\xC3\xA9") == 0);
{
/* %xx-encoded flush path (utfBufferJ = lastJ rollback) */
char wide[8];
assertf(hts_unescapeUrl("%C3%A9", wide, sizeof(wide)) == 0);
assertf(strcmp(wide, "\xC3\xA9") == 0);
}
printf("unescape-bounds self-test OK\n");
return 0;
}
@@ -1093,202 +1077,35 @@ static int st_resolve(httrackp *opt, int argc, char **argv) {
return 0;
}
/* Extra args are key=value: adr= cdispo= statuscode= status= strip= urlhack=
no-www= no-slash= no-query= n83= type=, plus repeatable prior=adr|fil|sav
registering an already-crawled link (dedup/collision paths). */
/* Parse raw response-header lines and print the naming-relevant fields. */
static int st_header(httrackp *opt, int argc, char **argv) {
htsblk r;
int i;
(void) opt;
if (argc < 1) {
fprintf(stderr, "header: needs at least one raw header line\n");
return 1;
}
memset(&r, 0, sizeof(r));
for (i = 0; i < argc; i++) {
char BIGSTK line[HTS_URLMAXSIZE * 2];
strcpybuff(line, argv[i]);
treathead(NULL, "www.example.com", "/", &r, line);
}
printf("contenttype=%s cdispo=%s\n", r.contenttype, r.cdispo);
return 0;
}
/* Decode a body argument ("hex:FFD8.." or literal text) into buf. */
static size_t st_decode_body(const char *arg, char *buf, size_t size) {
size_t n = 0;
if (strncmp(arg, "hex:", 4) == 0) {
const char *s = arg + 4;
for (; s[0] != '\0' && s[1] != '\0' && n + 1 < size; s += 2) {
unsigned int byte;
if (sscanf(s, "%2x", &byte) != 1)
break;
buf[n++] = (char) byte;
}
} else {
n = strlen(arg);
if (n >= size)
n = size - 1;
memcpy(buf, arg, n);
}
buf[n] = '\0';
return n;
}
static int st_savename(httrackp *opt, int argc, char **argv) {
lien_adrfilsave afs;
cache_back cache;
struct_back *sback;
hash_struct hash;
lien_back headers;
const char *adr = "www.example.com";
const char *cdispo = NULL;
const char *body = NULL;
const char *cached = NULL;
const char *bodyfile = "st-savename-body.tmp";
int statuscode = HTTP_OK, status = 0;
int i;
if (argc < 2) {
fprintf(stderr, "savename: needs a fil and a content-type\n");
return 1;
}
/* knobs first: hash_init and the prior links depend on them */
for (i = 2; i < argc; i++) {
const char *const a = argv[i];
if (strncmp(a, "adr=", 4) == 0)
adr = a + 4;
else if (strncmp(a, "cdispo=", 7) == 0)
cdispo = a + 7;
else if (strncmp(a, "statuscode=", 11) == 0)
statuscode = atoi(a + 11);
else if (strncmp(a, "status=", 7) == 0)
status = atoi(a + 7);
else if (strncmp(a, "strip=", 6) == 0)
StringCopy(opt->strip_query, a + 6);
else if (strncmp(a, "urlhack=", 8) == 0)
opt->urlhack = atoi(a + 8) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-www=", 7) == 0)
opt->no_www_dedup = atoi(a + 7) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-slash=", 9) == 0)
opt->no_slash_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "no-query=", 9) == 0)
opt->no_query_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
else if (strncmp(a, "n83=", 4) == 0)
opt->savename_83 = atoi(a + 4);
else if (strncmp(a, "type=", 5) == 0)
opt->savename_type = atoi(a + 5);
else if (strncmp(a, "body=", 5) == 0)
body = a + 5;
else if (strncmp(a, "cached=", 7) == 0)
cached = a + 7;
else if (strncmp(a, "prior=", 6) != 0) {
fprintf(stderr, "savename: unknown arg '%s'\n", a);
return 1;
}
}
memset(&afs, 0, sizeof(afs));
strcpybuff(afs.af.adr, adr);
strcpybuff(afs.af.adr, "www.example.com");
strcpybuff(afs.af.fil, argv[0]);
memset(&cache, 0, sizeof(cache));
if (cached != NULL) { /* cached=<content-type>|<save name> */
char *dup = strdupt(cached);
char *const sep = strchr(dup, '|');
char locbuf[64] = "";
htsblk cr;
if (sep == NULL) {
fprintf(stderr, "savename: cached needs ctype|save\n");
return 1;
}
*sep = '\0';
/* one-entry cache in cwd, reopened read-only; body is PNG magic on
purpose: naming must not depend on stored content */
StringCopy(opt->path_log, "");
cache.type = 1;
cache.log = cache.errlog = stderr;
cache.hashtable = coucal_new(0);
cache_init(&cache, opt);
hts_init_htsblk(&cr);
cr.statuscode = HTTP_OK;
strcpybuff(cr.msg, "OK");
strcpybuff(cr.contenttype, dup);
cr.location = locbuf;
cr.adr = strdupt("\x89PNG\r\n\x1a\n");
cr.size = 8;
cache_add(opt, &cache, &cr, adr, argv[0], sep + 1, 1, NULL);
freet(cr.adr);
if (cache.zipOutput != NULL) {
zipClose(cache.zipOutput, NULL);
cache.zipOutput = NULL;
}
memset(&cache, 0, sizeof(cache));
cache.type = 1;
cache.log = cache.errlog = stderr;
cache.hashtable = coucal_new(0);
cache.ro = 1;
cache_init(&cache, opt);
freet(dup);
} else {
cache.hashtable = (void *) coucal_new(0);
}
cache.hashtable = (void *) coucal_new(0);
sback = back_new(opt, opt->maxsoc * 32 + 1024);
/* same wiring as hts_mirror (htscore.c) */
hash_init(opt, &hash, opt->urlhack);
hash.liens = (const lien_url *const *const *) &opt->liens;
opt->hash = &hash;
hts_record_init(opt);
for (i = 2; i < argc; i++) {
if (strncmp(argv[i], "prior=", 6) == 0) {
char *dup = strdupt(argv[i] + 6);
char *const p1 = strchr(dup, '|');
char *const p2 = p1 != NULL ? strchr(p1 + 1, '|') : NULL;
if (p2 == NULL) {
fprintf(stderr, "savename: prior needs adr|fil|sav\n");
return 1;
}
*p1 = *p2 = '\0';
if (!hts_record_link(opt, dup, p1 + 1, p2 + 1, "", "", NULL))
return 1;
freet(dup);
}
}
memset(&headers, 0, sizeof(headers));
headers.status = status;
headers.r.statuscode = statuscode;
headers.status = 0;
headers.r.statuscode = HTTP_OK;
strcpybuff(headers.r.contenttype, argv[1]);
if (cdispo != NULL)
strcpybuff(headers.r.cdispo, cdispo);
strcpybuff(headers.url_fil, argv[0]);
if (body != NULL) { /* leading body bytes, exposed via url_sav */
char BIGSTK data[1024];
const size_t n = st_decode_body(body, data, sizeof(data));
FILE *const fp = fopen(bodyfile, "wb");
if (fp == NULL || fwrite(data, 1, n, fp) != n) {
fprintf(stderr, "savename: can not write %s\n", bodyfile);
return 1;
}
fclose(fp);
strcpybuff(headers.url_sav, bodyfile);
}
url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, &hash, 0, 0,
&headers);
if (body != NULL)
(void) UNLINK(bodyfile);
printf("savename: %s\n", afs.save);
return 0;
}
@@ -1970,86 +1787,6 @@ static int st_robots(httrackp *opt, int argc, char **argv) {
return 0;
}
/* get_ftp_line must bound a hostile, CRLF-less reply into its internal
1024-byte buffer; ASan turns the pre-fix overflow into an abort here. */
#ifndef _WIN32
static int st_ftpline(httrackp *opt, int argc, char **argv) {
int sv[2];
char line[2048];
char flood[4096];
(void) opt;
(void) argc;
(void) argv;
memset(flood, 'x', sizeof(flood));
assertf(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);
assertf(write(sv[1], "220 ", 4) == 4); // valid 3-digit code
assertf(write(sv[1], flood, sizeof(flood)) == (ssize_t) sizeof(flood));
assertf(write(sv[1], "\r\n", 2) == 2); // end the line so we return
close(sv[1]);
line[0] = '\0';
get_ftp_line(sv[0], line, sizeof(line), 5);
close(sv[0]);
printf("ftp-line self-test OK (bounded %d-byte reply)\n",
(int) sizeof(flood));
return 0;
}
#endif
/* ftp_split_userpass: well-formed split, plus a hostile over-long userinfo
that pre-fix overran user[256]/pass[256]. */
static int st_ftpuser(httrackp *opt, int argc, char **argv) {
char user[256], pass[256];
char in[1200];
(void) opt;
(void) argc;
(void) argv;
{
const char ok[] = "bob:secret@host/f"; // '@' at index 10
ftp_split_userpass(ok, ok + 11, user, sizeof(user), pass, sizeof(pass));
assertf(strcmp(user, "bob") == 0);
assertf(strcmp(pass, "secret") == 0);
}
memset(in, 'u', 400);
in[400] = ':';
memset(in + 401, 'p', 400);
in[801] = '@';
in[802] = '\0';
ftp_split_userpass(in, in + 802, user, sizeof(user), pass, sizeof(pass));
assertf(strlen(user) == sizeof(user) - 1);
assertf(strlen(pass) == sizeof(pass) - 1);
{
/* tight sizes + guard byte catch an off-by-one the 256 case can't */
char ubuf[16], pbuf[16];
memset(ubuf, 'Z', sizeof(ubuf));
memset(pbuf, 'Z', sizeof(pbuf));
ftp_split_userpass(in, in + 802, ubuf, 8, pbuf, 8);
assertf(strcmp(ubuf, "uuuuuuu") == 0);
assertf(strcmp(pbuf, "ppppppp") == 0);
assertf(ubuf[8] == 'Z' && pbuf[8] == 'Z');
}
printf("ftp-userpass self-test OK\n");
return 0;
}
/* hts_count_fits caps the .class constant-pool entry count to the file size,
rejecting the ~68 MB-per-file calloc DoS. */
static int st_java(httrackp *opt, int argc, char **argv) {
(void) opt;
(void) argc;
(void) argv;
assertf(hts_count_fits(10, 1000) == HTS_TRUE);
assertf(hts_count_fits(0, 10) == HTS_TRUE);
assertf(hts_count_fits(65535, 10) == HTS_FALSE);
assertf(hts_count_fits(1, 0) == HTS_FALSE);
assertf(hts_count_fits(1, -1) == HTS_FALSE);
printf("java constant-pool cap self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -2091,10 +1828,8 @@ static const struct selftest_entry {
st_relative},
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",
st_resolve},
{"header", "<raw-header-line> ...", "response header-line parsing",
st_header},
{"savename", "<fil> <content-type> [key=value ...]",
"local save-name for a URL", st_savename},
{"savename", "<fil> <content-type>", "local save-name for a URL",
st_savename},
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
st_cache_golden},
@@ -2114,12 +1849,6 @@ static const struct selftest_entry {
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
st_robots},
#ifndef _WIN32
{"ftp-line", "", "get_ftp_line bounds a hostile FTP reply line",
st_ftpline},
#endif
{"ftp-userpass", "", "ftp_split_userpass bounds URL userinfo", st_ftpuser},
{"java", "", "java .class constant-pool count cap self-test", st_java},
};
static void list_selftests(void) {

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# get_ftp_line bounds a hostile CRLF-less FTP reply into its 1024-byte buffer.
httrack -O /dev/null -#test=ftp-line run | grep -q "ftp-line self-test OK"

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# ftp_split_userpass bounds an over-long user:pass@ from a hostile ftp:// URL.
httrack -O /dev/null -#test=ftp-userpass run | grep -q "ftp-userpass self-test OK"

View File

@@ -1,29 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# Response header-line parsing (treathead via -#test=header <raw-line> ...).
# Isolates the wire layer from url_savename, which strips traversal on its own.
hdr() {
local want="$1"
shift
out="$(httrack -O /dev/null -#test=header "$@" | grep '^contenttype=')"
test "$out" == "$want" || {
echo "FAIL: $* -> '$out' (want '$want')"
exit 1
}
}
hdr 'contenttype=application/pdf cdispo=' 'Content-Type: application/pdf'
# filename= is honored quoted or bare.
hdr 'contenttype= cdispo=report.pdf' \
'Content-Disposition: attachment; filename="report.pdf"'
hdr 'contenttype= cdispo=report.pdf' \
'Content-Disposition: attachment; filename=report.pdf'
# Path components in the filename are dropped on the wire (RFC 2616).
hdr 'contenttype= cdispo=evil.pdf' \
'Content-Disposition: attachment; filename="../../evil.pdf"'

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# .class constant-pool count is capped to the file size (calloc DoS).
httrack -O /dev/null -#test=java run | grep -q "java constant-pool cap self-test OK"

View File

@@ -3,38 +3,13 @@
set -euo pipefail
# Local save-name resolution (url_savename via -#test=savename <fil> <content-type> [key=value ...]).
# name() asserts on the basename, full() on the whole path; prior= registers an
# already-crawled link whose sav is rooted under the -O path (/dev/null here).
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
# scratch dir: body= and cached= write temp files (st-savename-body.tmp, hts-cache/)
scratch=$(mktemp -d)
trap 'rm -rf "$scratch"' EXIT
cd "$scratch"
run() {
"$httrack_bin" -O /dev/null -#test=savename "$@" | sed -n 's/^savename: //p'
}
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
# Asserts on the basename of "savename: <path>".
name() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$(run "$fil" "$ctype" "$@")"
test "${out##*/}" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
exit 1
}
}
full() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$(run "$fil" "$ctype" "$@")"
test "$out" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
test "${out##*/}" == "$3" || {
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
exit 1
}
}
@@ -64,96 +39,3 @@ name '/types/data.json' 'application/json' 'data.json'
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
name '/x.JPG' 'image/jpeg' 'x.JPG'
# A Content-Disposition filename replaces the URL name outright.
name '/x.php' 'application/pdf' 'report.pdf' cdispo=report.pdf
name '/download' 'text/html' 'setup.exe' cdispo=setup.exe
# Reserved characters in a hostile Content-Disposition name are sanitized.
name '/x.php' 'application/pdf' 'set_up.exe' 'cdispo=set:up.exe'
# The md5-of-query suffix lands inside a Content-Disposition name too.
name '/x.php?id=1' 'application/pdf' 'report681a.pdf' cdispo=report.pdf
# Still-downloading path (status=-1): mime drives the ext, cdispo is ignored
# there (the deliberately unfolded 4th resolve_extension variant).
name '/x.pdf' 'text/html' 'x.html' status=-1
name '/x.html' 'text/html' 'x.html' status=-1
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
# Contested type (wire disagrees with a specific ext): the wire is trusted and
# body bytes are not consulted; pinned so a content-based tie-break shows up
# as an explicit flip of these rows.
name '/photo.jpg' 'image/png' 'photo.png' body=hex:FFD8FFE000104A46
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
name '/photo.jpg' 'image/png' 'photo.png'
name '/doc.pdf' 'text/html' 'doc.html' body=hex:255044462D312E34
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
name '/style.css' 'image/png' 'style.png' 'body=body { }'
# A redirect answer resolves nothing: delayed placeholder name.
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
# Root and query-only URLs get index + the md5-of-query suffix.
name '/' 'text/html' 'index.html'
name '/?a=1' 'text/html' 'index3872.html'
# Same URL crawled before: reuse its sav verbatim (case preserved).
full '/X.PHP' 'text/html' 'www.example.com/CASE.HTML' \
'prior=www.example.com|/X.PHP|www.example.com/CASE.HTML'
# Another URL owns the name: collision suffix -2, then -3, case-insensitively.
name '/x.php' 'text/html' 'x-2.html' \
'prior=www.example.com|/other.html|/dev/null/www.example.com/x.html'
name '/x.php' 'text/html' 'x-3.html' \
'prior=www.example.com|/o1.html|/dev/null/www.example.com/x.html' \
'prior=www.example.com|/o2.html|/dev/null/www.example.com/x-2.html'
name '/INDEX.HTML' 'text/html' 'INDEX-2.HTML' \
'prior=www.example.com|/index.html|/dev/null/www.example.com/index.html'
# Same basename in another directory is NOT a collision.
name '/x.php' 'text/html' 'x.html' \
'prior=www.example.com|/sub/x.html|/dev/null/www.example.com/sub/x.html'
# 8-3 modes: DOS truncates every component to 8+3, ISO9660 level 2 to 31.
full '/directory-long/verylongfilename.html' 'text/html' \
'/dev/null/EXAMPLE/DIRECTOR/VERYLONG.HTM' n83=1
full '/directory-long/verylongfilename.html' 'text/html' \
'/dev/null/EXAMPLE_C/DIRECTORY_LONG/VERYLONGFILENAME.HTM' n83=2
name '/verylongfilename.php' 'text/html' 'VERYLO-2.HTM' n83=1 \
'prior=www.example.com|/other.html|/dev/null/EXAMPLE/VERYLONG.HTM'
# urlhack dedup (#271): // collapse and www-strip map to the prior link's sav;
# the per-feature negatives opt out and take a fresh name.
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/PRIOR.html' \
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' no-slash=1 \
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
full '/w.php' 'text/html' '/dev/null/www.example.com/W-PRIOR.html' adr=example.com \
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
full '/w.php' 'text/html' '/dev/null/example.com/w.html' adr=example.com no-www=1 \
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
# Distinct URLs must stay distinct under urlhack (no over-normalization).
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' \
'prior=www.example.com|/a/c.php|/dev/null/www.example.com/a/C-PRIOR.html'
# --strip-query (#112): stripped key dedups onto the prior sav; without the
# option the same URLs stay distinct.
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/PAGE-PRIOR.html' \
strip=sid 'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
# A kept key that differs must still block the dedup (no over-stripping).
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
strip=sid 'prior=www.example.com|/page.php?id=4|/dev/null/www.example.com/PAGE-PRIOR.html'
# Hostile fils stay rooted under the mirror: ../ (raw or %2e-encoded) drops out,
# control characters become spaces, oversized names cap at 210 chars (the cap
# can chop the extension off entirely).
full '/../../etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
full '/%2e%2e/%2e%2e/etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
full '/x.php' 'application/pdf' '/dev/null/www.example.com///evil.exe' 'cdispo=../../evil.exe'
name $'/evil\rname\t.php' 'text/html' 'evil name .html'
name "/$(printf 'a%.0s' {1..300}).php" 'text/html' "$(printf 'a%.0s' {1..210})"

View File

@@ -1,33 +0,0 @@
#!/bin/bash
#
set -euo pipefail
# Update-run naming from a real cache entry (-#test=savename cached=<ctype>|<save>).
# Named 01_zlib-*: the cache writer needs zlib, which the MSan job can't run.
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
scratch=$(mktemp -d)
trap 'rm -rf "$scratch"' EXIT
cd "$scratch"
name() {
local fil="$1" ctype="$2" want="$3"
shift 3
out="$("$httrack_bin" -O /dev/null -#test=savename "$fil" "$ctype" "$@" | sed -n 's/^savename: //p')"
test "${out##*/}" == "$want" || {
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
exit 1
}
}
# Names are re-derived from the stored headers on every run: neither the
# recorded save name nor the cached body bytes change the verdict (pinned).
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.jpg'
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
name '/style.css' 'image/png' 'style.png' 'cached=image/png|www.example.com/style.css'
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'

View File

@@ -15,10 +15,6 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'types/photo.png' \
--found 'types/doc.pdf' \
--found 'types/lie.html' --not-found 'types/lie.png' \
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
--found 'types/bigtype.png' --not-found 'types/bigtype.jpg' \
--found 'types/packed.png' --not-found 'types/packed.jpg' \
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
--found 'types/report.html' --not-found 'types/report.pdf' \
--found 'types/page.htm' --not-found 'types/page.html' \
--found 'types/script.js' \

View File

@@ -12,7 +12,4 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
--found 'types/report.html' --not-found 'types/report.pdf' \
--found 'types/notype.png' --not-found 'types/notype.html' \
--found 'types/lie.html' \
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
--found 'types/packed.png' --not-found 'types/packed.jpg' \
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
httrack 'BASEURL/types/index.html'

View File

@@ -1,23 +0,0 @@
#!/bin/bash
# The java plugin must load (versioned dlopen name) and parse a .class
# constant pool: a resource named only inside Foo.class gets crawled.
set -e
: "${top_srcdir:=..}"
tmproot=$(mktemp -d)
trap 'rm -rf "$tmproot"' EXIT
mkdir "$tmproot/javaclass"
cat >"$tmproot/javaclass/index.html" <<'EOF'
<html><body><a href="Foo.class">applet</a></body></html>
EOF
printf 'GIF89a' >"$tmproot/javaclass/hello.gif"
# magic/minor/major, count=2, one CONSTANT_Utf8 "hello.gif", class/superclass
printf '\xCA\xFE\xBA\xBE\x00\x00\x00\x32\x00\x02\x01\x00\x09hello.gif\x00\x00\x00\x00' \
>"$tmproot/javaclass/Foo.class"
bash "$top_srcdir/tests/local-crawl.sh" --root "$tmproot" --errors 0 \
--found 'javaclass/Foo.class' \
--found 'javaclass/hello.gif' \
httrack 'BASEURL/javaclass/index.html'

View File

@@ -1,17 +0,0 @@
#!/bin/bash
#
# Content-Disposition names the saved file: the attachment filename replaces
# the URL-derived name, and a traversal filename is reduced to its last
# component, inside the mirror.
set -euo pipefail
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'cdispo/report.pdf' \
--file-matches 'cdispo/report.pdf' '%PDF' \
--not-found 'cdispo/fetch.pdf' \
--found 'cdispo/evil.pdf' \
--not-found 'evil.pdf' \
httrack 'BASEURL/cdispo/index.html'

View File

@@ -1,20 +0,0 @@
#!/bin/bash
#
# Degenerate delayed-type paths (#5/#107 family): redirects that never resolve
# a name must drop cleanly -- no .delayed leftovers (audited by local-crawl.sh),
# no "bogus state" cache warnings, resolvable links still land correctly.
set -euo pipefail
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --rerun --errors 0 \
--found 'delayed/real.pdf' \
--file-matches 'delayed/real.pdf' '%PDF' \
--found 'delayed/notype.bin.html' \
--found 'delayed/empty.html' \
--not-found 'delayed/noloc.html' \
--not-found 'delayed/selfloop.html' \
--not-found 'delayed/chain9.pdf' \
--log-not-found 'bogus state' \
httrack 'BASEURL/delayed/index.html'

View File

@@ -35,14 +35,10 @@ TESTS = \
01_engine-entities.test \
01_engine-filelist.test \
01_engine-filter.test \
01_engine-ftp-line.test \
01_engine-ftp-userpass.test \
01_engine-hashtable.test \
01_engine-header.test \
01_engine-idna.test \
01_engine-escape-room.test \
01_engine-inplace-escape.test \
01_engine-java.test \
01_engine-makeindex.test \
01_engine-mime.test \
01_engine-parse.test \
@@ -64,7 +60,6 @@ TESTS = \
01_zlib-cache.test \
01_zlib-cache-golden.test \
01_zlib-cache-writefail.test \
01_zlib-savename-cached.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \
@@ -92,9 +87,6 @@ TESTS = \
27_local-cookies-file.test \
28_local-pause.test \
29_local-redirect-fragment.test \
30_local-fragment-link.test \
31_local-javaclass.test \
32_local-cdispo.test \
33_local-delayed.test
30_local-fragment-link.test
CLEANFILES = check-network_sh.cache

View File

@@ -246,14 +246,6 @@ done
test -n "$hostroot" || die "could not find host root under $out"
debug "host root: $hostroot"
# A completed crawl must leave no .delayed temporaries (issue #107)
info "checking for leftover .delayed files"
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
if test -z "$leftovers"; then result "OK"; else
result "leftover: $leftovers"
exit 1
fi
# --- audit -------------------------------------------------------------------
i=0
while test "$i" -lt "${#audit[@]}"; do

View File

@@ -14,7 +14,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
"""
import argparse
import gzip
import os
import time
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
@@ -135,14 +134,12 @@ class Handler(SimpleHTTPRequestHandler):
# --- type/extension matrix (issue #267 family) -------------------------
def send_raw(self, body, content_type, extra_headers=()):
def send_raw(self, body, content_type):
"""Send a raw body with an explicit Content-Type, or none at all when
content_type is None (to observe httrack's typeless-file naming)."""
self.send_response(200)
if content_type is not None:
self.send_header("Content-Type", content_type)
for name, value in extra_headers:
self.send_header(name, value)
self.send_header("Content-Length", str(len(body)))
self.end_headers()
if self.command != "HEAD":
@@ -151,8 +148,6 @@ class Handler(SimpleHTTPRequestHandler):
# Fake-binary blobs for the image/pdf/typeless cases.
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
FAKE_JPEG = b"\xff\xd8\xff\xe0" + b"\x00" * 64
BIG_JPEG = b"\xff\xd8\xff\xe0" + bytes(range(256)) * 64 # > sniff window
# path -> (body, content_type); None sends no header, "" sends an empty
# Content-Type value (no usable type, must be treated like None).
@@ -164,8 +159,6 @@ class Handler(SimpleHTTPRequestHandler):
"/types/notype.pdf": (FAKE_PDF, None),
"/types/emptyct.png": (FAKE_PNG, ""),
"/types/lie.png": (FAKE_PNG, "text/html"),
"/types/wrongtype.jpg": (FAKE_JPEG, "image/png"),
"/types/bigtype.jpg": (BIG_JPEG, "image/png"),
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
@@ -183,10 +176,6 @@ class Handler(SimpleHTTPRequestHandler):
'\t<a href="notype.pdf">notypepdf</a>\n'
'\t<img src="emptyct.png" />\n'
'\t<img src="lie.png" />\n'
'\t<img src="wrongtype.jpg" />\n'
'\t<img src="bigtype.jpg" />\n'
'\t<img src="mutant.jpg" />\n'
'\t<img src="packed.jpg" />\n'
'\t<a href="report.pdf">report</a>\n'
'\t<a href="page.htm">htm</a>\n'
'\t<script src="script.js"></script>\n'
@@ -201,25 +190,6 @@ class Handler(SimpleHTTPRequestHandler):
body, ctype = self.TYPE_MATRIX[path]
self.send_raw(body, ctype)
# content changes between crawls: run 1 sniffs JPEG, the update pass must
# keep the run-1 name (recorded verdict) even though the body is now PNG
MUTANT_SEEN = set()
def route_types_mutant(self):
path = urlsplit(self.path).path
body = self.FAKE_PNG if path in self.MUTANT_SEEN else self.FAKE_JPEG
if self.command != "HEAD":
self.MUTANT_SEEN.add(path)
self.send_raw(body, "image/png")
# gzip on the wire: the sniff must see the decoded body, not the stream
def route_types_packed(self):
self.send_raw(
gzip.compress(self.FAKE_JPEG),
"image/png",
extra_headers=[("Content-Encoding", "gzip")],
)
# --- MIME-type exclusion abort (issue #58) -----------------------------
# A -mime:application/pdf filter must abort the transfer once the header
# arrives, not download the whole body and discard it.
@@ -384,27 +354,6 @@ class Handler(SimpleHTTPRequestHandler):
if self.command != "HEAD":
self.wfile.write(body)
# Content-Disposition naming: the attachment filename replaces the
# URL-derived name; path components in it are stripped (RFC 2616).
CDISPO_NAMES = {
"/cdispo/fetch.php": "report.pdf",
"/cdispo/evil.php": "../../evil.pdf",
}
def route_cdispo_index(self):
self.send_html(
'\t<a href="fetch.php">report</a>\n' '\t<a href="evil.php">evil</a>\n'
)
def route_cdispo(self):
filename = self.CDISPO_NAMES[urlsplit(self.path).path]
cdispo = 'attachment; filename="%s"' % filename
self.send_raw(
self.FAKE_PDF,
"application/pdf",
extra_headers=[("Content-Disposition", cdispo)],
)
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
# that must be dropped before the target is fetched. A leaked '#' reaches the
# strict-server guard below and 400s.
@@ -420,50 +369,6 @@ class Handler(SimpleHTTPRequestHandler):
def route_redir_target(self):
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
# --- delayed-type degenerate paths (issues #5/#107) --------------------
def route_delayed_index(self):
self.send_html(
'\t<a href="noloc.php">noloc</a>\n'
'\t<a href="selfloop.php">selfloop</a>\n'
'\t<a href="chain1.php">chain</a>\n'
'\t<a href="redir.php">redir</a>\n'
'\t<a href="notype.bin">notype</a>\n'
'\t<a href="empty.php">empty</a>\n'
)
def send_redirect(self, location):
self.send_response(302, "Found")
if location is not None:
self.send_header("Location", location)
self.send_header("Content-Length", "0")
self.end_headers()
def route_delayed_noloc(self):
self.send_redirect(None) # 302 without Location: name never resolves
def route_delayed_selfloop(self):
self.send_redirect("selfloop.php")
def route_delayed_chain(self):
# chain1..chain9: one more hop than the type-check redirect budget
n = int(urlsplit(self.path).path.rsplit("chain", 1)[1].split(".")[0])
if n < 9:
self.send_redirect("chain%d.php" % (n + 1))
else:
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_redir(self):
self.send_redirect("real.pdf")
def route_delayed_realpdf(self):
self.send_raw(self.FAKE_PDF, "application/pdf")
def route_delayed_notype(self):
self.send_raw(self.FAKE_PDF, None)
def route_delayed_empty(self):
self.send_raw(b"", "text/html") # 200 + Content-Length: 0
ROUTES = {
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
@@ -479,10 +384,6 @@ class Handler(SimpleHTTPRequestHandler):
"/types/notype.pdf": route_types,
"/types/emptyct.png": route_types,
"/types/lie.png": route_types,
"/types/wrongtype.jpg": route_types,
"/types/bigtype.jpg": route_types,
"/types/mutant.jpg": route_types_mutant,
"/types/packed.jpg": route_types_packed,
"/types/report.pdf": route_types,
"/types/page.htm": route_types,
"/types/script.js": route_types,
@@ -505,25 +406,6 @@ class Handler(SimpleHTTPRequestHandler):
"/mimex/index.html": route_mimex_index,
"/mimex/blob.pdf": route_mimex_blob,
"/mimex/real.html": route_mimex_real,
"/cdispo/index.html": route_cdispo_index,
"/cdispo/fetch.php": route_cdispo,
"/cdispo/evil.php": route_cdispo,
"/delayed/index.html": route_delayed_index,
"/delayed/noloc.php": route_delayed_noloc,
"/delayed/selfloop.php": route_delayed_selfloop,
"/delayed/redir.php": route_delayed_redir,
"/delayed/real.pdf": route_delayed_realpdf,
"/delayed/notype.bin": route_delayed_notype,
"/delayed/empty.php": route_delayed_empty,
"/delayed/chain1.php": route_delayed_chain,
"/delayed/chain2.php": route_delayed_chain,
"/delayed/chain3.php": route_delayed_chain,
"/delayed/chain4.php": route_delayed_chain,
"/delayed/chain5.php": route_delayed_chain,
"/delayed/chain6.php": route_delayed_chain,
"/delayed/chain7.php": route_delayed_chain,
"/delayed/chain8.php": route_delayed_chain,
"/delayed/chain9.php": route_delayed_chain,
"/redir/index.html": route_redir_index,
"/redir/go.php": route_redir_go,
"/redir/target.html": route_redir_target,