Compare commits

...

2 Commits

Author SHA1 Message Date
Xavier Roche
90e804a712 Fix three network-facing overflows in the FTP and Java parsers
get_ftp_line() copied a server reply byte-by-byte into a fixed char[1024]
with no index bound, so a hostile or MITM FTP server could smash the stack
with an over-long CRLF-less line. Bound the write and truncate.

The ftp:// userinfo parser copied "user:pass@" into user[256]/pass[256] with
two unbounded loops, overflowing from a long userinfo supplied by a hostile
ftp:// link. Extract the split into ftp_split_userpass(), which truncates
each field to fit.

The Java .class parser did calloc(header.count, sizeof(RESP_STRUCT)) on an
attacker-controlled u2 count, allocating ~68 MB per crafted class (DoS). Cap
the count to the file size (each constant-pool entry is at least one byte on
disk) via a new hts_count_fits() guard, and move the alloc/free to the
bounds-checked calloct/freet wrappers.

Self-tests: ftp-line drives get_ftp_line over a socketpair with a 4 KB reply,
ftp-userpass feeds an over-long userinfo, java exercises the count cap. The
first two abort under ASan on the pre-fix code.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 20:28:02 +02:00
Xavier Roche
92db2f2b41 htsparse: follow same-file redirects instead of self-pointing stubs (#159) (#471)
An http->https redirect (or any alias where the savename strips a
component the URL keeps) collapses the source and target onto one saved
file. hts_mirror_check_moved wrote a "Page has moved" stub linking to the
target's savename, but that equals the source's, so the stub pointed at
itself and the real content was never saved.

Detect a same-file alias with a new hts_redirect_same_savefile helper
(scheme and userinfo stripped, www kept, path slash/query-normalized per
the URL-hack dedup flags) and follow the redirect through: record the
moved link at the same savename so its content overwrites the placeholder.
Genuinely-different moves keep the stub. The old informational "URL Hack
identical" log is superseded by an actionable message on the followed
redirect. Covered by a redirect-samefile engine self-test.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 09:41:17 +02:00
12 changed files with 252 additions and 56 deletions

View File

@@ -128,6 +128,31 @@ void launch_ftp(FTPDownloadStruct * params) {
return 0; \
}
/* Bounded split of a hostile-URL "user[:pass]@" prefix (see htsftp.h). */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size) {
size_t n = 0;
while (src[n] != '\0' && src[n] != ':') {
if (n < user_size - 1)
user[n] = src[n];
n++;
}
user[n < user_size ? n : user_size - 1] = '\0';
pass[0] = '\0';
if (src[n] == ':') { // password follows the colon
const size_t base = n + 1;
size_t k = 0;
while (&src[base + k + 1] < end && src[base + k] != '\0') {
if (k < pass_size - 1)
pass[k] = src[base + k];
k++;
}
pass[k < pass_size ? k : pass_size - 1] = '\0';
}
}
// la véritable fonction une fois lancées les routines thread/fork
int run_launch_ftp(FTPDownloadStruct * pStruct) {
lien_back *back = pStruct->pBack;
@@ -173,24 +198,7 @@ int run_launch_ftp(FTPDownloadStruct * pStruct) {
while(*real_adr == '/')
real_adr++; // sauter /
if ((adr = jump_identification(real_adr)) != real_adr) { // user
int i = -1;
pass[0] = '\0';
do {
i++;
user[i] = real_adr[i];
} while((real_adr[i] != ':') && (real_adr[i]));
user[i] = '\0';
if (real_adr[i] == ':') { // pass
int j = -1;
i++; // oui on saute aussi le :
do {
j++;
pass[j] = real_adr[i + j];
} while(((&real_adr[i + j + 1]) < adr) && (real_adr[i + j]));
pass[j] = '\0';
}
ftp_split_userpass(real_adr, adr, user, sizeof(user), pass, sizeof(pass));
}
// Calculer RETR <nom>
{
@@ -984,8 +992,8 @@ int get_ftp_line(T_SOC soc, char *ptrline, size_t line_size, int timeout) {
//case 0: break; // pas encore --> erreur (on attend)!
case 1:
HTS_STAT.HTS_TOTAL_RECV += 1; // compter flux entrant
if ((b != 10) && (b != 13))
data[i++] = b;
if ((b != 10) && (b != 13) && (i < (int) sizeof(data) - 1))
data[i++] = b; // truncate hostile over-long reply lines
break;
default:
if (ptrline)

View File

@@ -70,6 +70,10 @@ int back_launch_ftp(FTPDownloadStruct * params);
int run_launch_ftp(FTPDownloadStruct * params);
int send_line(T_SOC soc, const char *data);
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
bounded, NUL-terminated user/pass buffers, truncating to fit. */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size);
T_SOC get_datasocket(char *to_send, size_t to_send_size);
int stop_ftp(lien_back * back);
char *linejmp(char *line);

View File

@@ -63,6 +63,9 @@ Please visit our Website: http://www.httrack.com
/* This file */
#include "htsjava.h"
/* calloct/freet wrappers */
#include "htssafe.h"
static int reverse_endian(void) {
int endian = 1;
@@ -204,7 +207,16 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
return 0;
}
tab = (RESP_STRUCT *) calloc(header.count, sizeof(RESP_STRUCT));
/* A constant-pool entry is >= 1 byte on disk; reject a count exceeding
the file size (hostile .class ~68 MB alloc DoS). */
if (!hts_count_fits(header.count, (LLint) fsize(file))) {
fclose(fpout);
sprintf(str->err_msg,
"Invalid constant pool count %u (file len " LLintP ")",
(unsigned) header.count, (LLint) fsize(file));
return 0;
}
tab = (RESP_STRUCT *) calloct(header.count, sizeof(RESP_STRUCT));
if (!tab) {
sprintf(str->err_msg, "Unable to alloc %d bytes",
(int) sizeof(RESP_STRUCT));
@@ -230,7 +242,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
} else { // ++ une erreur est survenue!
if (strnotempty(str->err_msg) == 0)
strcpy(str->err_msg, "Internal readtable error");
free(tab);
freet(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;
@@ -288,7 +300,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
#if JAVADEBUG
printf("end\n");
#endif
free(tab);
freet(tab);
if (fpout) {
fclose(fpout);
fpout = NULL;

View File

@@ -3488,6 +3488,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
return 0;
}
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
* contract in htsparse.h. */
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil) {
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
const int norm_query = opt->urlhack && !opt->no_query_dedup;
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
if (strcasecmp(jump_identification_const(moved_adr),
jump_identification_const(cur_adr)) != 0)
return HTS_FALSE;
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
return strcasecmp(n_fil, pn_fil) == 0;
}
/*
Check 301, 302, .. statuscodes (moved)
*/
@@ -3533,36 +3551,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
if ((reponse =
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
}
// A same-file alias redirect must be followed, not stubbed (#159).
const hts_boolean same_savefile = hts_redirect_same_savefile(
opt, urladr(), urlfil(), moved->adr, moved->fil);
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
// c'est (en gros) la même URL..
// si c'est un problème de casse dans le host c'est que le serveur est buggé
@@ -3590,7 +3581,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
moved->adr, moved->fil);
}
} /* sinon traité normalement */
} else if (same_savefile) {
// A stub would point at itself; follow the redirect instead.
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
&set_prio_to, NULL) != 1) {
get_it = 1;
hts_log_print(opt, LOG_WARNING,
"Redirect to a same-file alias, fetching real "
"content: %s%s -> %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
} /* sinon traité normalement */
}
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
@@ -3613,7 +3614,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
heap(heap(ptr)->precedent)->adr,
heap(heap(ptr)->precedent)->fil, opt,
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// Same-file alias: the reserved name is the invalidated source,
// so record anyway.
if (same_savefile ||
hash_read(hash, savedmoved.save, NULL,
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien avec SAV IDENTIQUE
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
// mode test?
@@ -3637,7 +3642,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
"moving %s to an existing file %s",
heap(ptr)->fil, urlfil());
}
}
}

View File

@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
int hts_mirror_check_moved(htsmoduleStruct * str,
htsmoduleStructExtended * stre);
/*
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
same local file, so it must be followed rather than turned into a
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
stripped, www kept (www dedup is the crawl layer's job), path
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
on the dedup hash, which folds www and never collapses http<->https.
*/
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil);
/*
Process user intercations: pause, add link, delete link..
*/

View File

@@ -456,6 +456,13 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
return b->buf;
}
/** True if 'count' records of >= 1 byte each fit in 'available' bytes; guards
an attacker-controlled count driving a large allocation. */
static HTS_INLINE HTS_UNUSED hts_boolean hts_count_fits(size_t count,
LLint available) {
return (available >= 0 && (LLint) count <= available) ? HTS_TRUE : HTS_FALSE;
}
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
added bounds checking. freet() also NULLs the freed pointer and tolerates
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the

View File

@@ -45,10 +45,12 @@ Please visit our Website: http://www.httrack.com
#include "htscore.h"
#include "htsdefines.h"
#include "htslib.h"
#include "htsparse.h"
#include "htscache_selftest.h"
#include "htsdns_selftest.h"
#include "htscharset.h"
#include "htsencoding.h"
#include "htsftp.h"
#include "htsmd5.h"
#if HTS_USEZLIB
#include "htszlib.h"
@@ -60,6 +62,10 @@ Please visit our Website: http://www.httrack.com
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef _WIN32
#include <sys/socket.h>
#include <unistd.h>
#endif
/* very minimalistic internal tests */
static void basic_selftests(void) {
@@ -1340,6 +1346,37 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
return 0;
}
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
* alias. */
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
(void) argc;
(void) argv;
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
/* scheme and userinfo collapse (the #159 case); a different path does not */
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
*/
opt->urlhack = HTS_TRUE;
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
/* slash/query fold only when the dedup flag is on */
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
assertf(
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
assertf(
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
#undef SAME
printf("redirect-samefile self-test OK\n");
return 0;
}
// hts_finish_makeindex writes the footer, emits the refresh meta only when
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
static int st_makeindex(httrackp *opt, int argc, char **argv) {
@@ -1737,6 +1774,75 @@ static int st_robots(httrackp *opt, int argc, char **argv) {
return 0;
}
/* get_ftp_line must bound a hostile, CRLF-less reply into its internal
1024-byte buffer; ASan turns the pre-fix overflow into an abort here. */
#ifndef _WIN32
static int st_ftpline(httrackp *opt, int argc, char **argv) {
int sv[2];
char line[2048];
char flood[4096];
(void) opt;
(void) argc;
(void) argv;
memset(flood, 'x', sizeof(flood));
assertf(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);
assertf(write(sv[1], "220 ", 4) == 4); // valid 3-digit code
assertf(write(sv[1], flood, sizeof(flood)) == (ssize_t) sizeof(flood));
assertf(write(sv[1], "\r\n", 2) == 2); // end the line so we return
close(sv[1]);
line[0] = '\0';
get_ftp_line(sv[0], line, sizeof(line), 5);
close(sv[0]);
printf("ftp-line self-test OK (bounded %d-byte reply)\n",
(int) sizeof(flood));
return 0;
}
#endif
/* ftp_split_userpass: well-formed split, plus a hostile over-long userinfo
that pre-fix overran user[256]/pass[256]. */
static int st_ftpuser(httrackp *opt, int argc, char **argv) {
char user[256], pass[256];
char in[1200];
(void) opt;
(void) argc;
(void) argv;
{
const char ok[] = "bob:secret@host/f"; // '@' at index 10
ftp_split_userpass(ok, ok + 11, user, sizeof(user), pass, sizeof(pass));
assertf(strcmp(user, "bob") == 0);
assertf(strcmp(pass, "secret") == 0);
}
memset(in, 'u', 400);
in[400] = ':';
memset(in + 401, 'p', 400);
in[801] = '@';
in[802] = '\0';
ftp_split_userpass(in, in + 802, user, sizeof(user), pass, sizeof(pass));
assertf(strlen(user) == sizeof(user) - 1);
assertf(strlen(pass) == sizeof(pass) - 1);
printf("ftp-userpass self-test OK\n");
return 0;
}
/* hts_count_fits caps the .class constant-pool entry count to the file size,
rejecting the ~68 MB-per-file calloc DoS. */
static int st_java(httrackp *opt, int argc, char **argv) {
(void) opt;
(void) argc;
(void) argv;
assertf(hts_count_fits(10, 1000) == HTS_TRUE);
assertf(hts_count_fits(0, 10) == HTS_TRUE);
assertf(hts_count_fits(65535, 10) == HTS_FALSE);
assertf(hts_count_fits(1, 0) == HTS_FALSE);
assertf(hts_count_fits(1, -1) == HTS_FALSE);
printf("java constant-pool cap self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1757,6 +1863,8 @@ static const struct selftest_entry {
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
st_redirect_samefile},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},
@@ -1795,6 +1903,12 @@ static const struct selftest_entry {
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
st_robots},
#ifndef _WIN32
{"ftp-line", "", "get_ftp_line bounds a hostile FTP reply line",
st_ftpline},
#endif
{"ftp-userpass", "", "ftp_split_userpass bounds URL userinfo", st_ftpuser},
{"java", "", "java .class constant-pool count cap self-test", st_java},
};
static void list_selftests(void) {

7
tests/01_engine-ftp-line.test Executable file
View File

@@ -0,0 +1,7 @@
#!/bin/bash
#
set -euo pipefail
# get_ftp_line bounds a hostile CRLF-less FTP reply into its 1024-byte buffer.
httrack -O /dev/null -#test=ftp-line run | grep -q "ftp-line self-test OK"

View File

@@ -0,0 +1,7 @@
#!/bin/bash
#
set -euo pipefail
# ftp_split_userpass bounds an over-long user:pass@ from a hostile ftp:// URL.
httrack -O /dev/null -#test=ftp-userpass run | grep -q "ftp-userpass self-test OK"

7
tests/01_engine-java.test Executable file
View File

@@ -0,0 +1,7 @@
#!/bin/bash
#
set -euo pipefail
# .class constant-pool count is capped to the file size (calloc DoS).
httrack -O /dev/null -#test=java run | grep -q "java constant-pool cap self-test OK"

View File

@@ -0,0 +1,9 @@
#!/bin/bash
#
set -euo pipefail
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
# followed through, not turned into a self-pointing "moved" stub. The decision
# helper is exercised by the engine self-test.
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"

View File

@@ -35,15 +35,19 @@ TESTS = \
01_engine-entities.test \
01_engine-filelist.test \
01_engine-filter.test \
01_engine-ftp-line.test \
01_engine-ftp-userpass.test \
01_engine-hashtable.test \
01_engine-idna.test \
01_engine-escape-room.test \
01_engine-inplace-escape.test \
01_engine-java.test \
01_engine-makeindex.test \
01_engine-mime.test \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-redirect.test \
01_engine-relative.test \
01_engine-robots.test \
01_engine-savename.test \