Compare commits

...

5 Commits

Author SHA1 Message Date
Xavier Roche
efa3896fa4 tests: cover the %xx-encoded UTF-8 flush path in unescape-bounds
The raw-byte cases never take the utfBufferJ = lastJ rollback branch, so a
wrong flush offset there would have passed (review finding).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 22:37:55 +02:00
Xavier Roche
d2b71d878c configure: use the dylib plugin name on Darwin
libtool names the module libhtsjava.N.dylib there, so the .so.N form can
never load; caught by 31_local-javaclass.test on the macOS CI job (the old
hardcoded .so.2 was just as dead, silently).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 22:37:55 +02:00
Xavier Roche
1249c42f1e htsftp: assert nonzero buffer sizes, harden the userpass self-test
ftp_split_userpass underflows its size-1 math on a zero size; assert the
precondition now that the function is public in htsftp.h. The self-test
gains a tight-size run with guard bytes and exact-content checks, because the
256-byte buffers alone would never have failed on an off-by-one.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 22:28:04 +02:00
Xavier Roche
5c1ffaf3a6 Resurrect the java .class parser on modern Unix builds
The plugin was dead three ways on a current Linux build: hts_plug() was
compiled hidden (-fvisibility=hidden, EXTERNAL_FUNCTION expanded to nothing
on ELF), hts_create_opt() dlopens libhtsjava.so.2 which no longer exists
since the soname moved to .so.3, and JAVA_HEADER.magic is 'unsigned long'
(8 bytes on LP64) under a 10-byte fread, so major/count came from
uninitialized bytes and the 0xCAFEBABE check never matched.

EXTERNAL_FUNCTION now forces default visibility on ELF, the dlopen name is
derived from VERSION_INFO at configure time, and the header fields are
fixed-width. 31_local-javaclass.test crawls a generated .class and asserts
a resource named only in its constant pool is fetched; it fails if any of
the three regresses.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 22:27:55 +02:00
Xavier Roche
f38472c697 htsencoding: bound the raw UTF-8 flush in hts_unescapeUrlSpecial
The completed-sequence flush memcpy ends with a 'continue' that skips the
per-byte NUL-reserve guard, so a raw multi-byte character landing at the
exact end of dest let the trailing NUL be written to dest[max] (1-byte OOB, found
by the post-#474 review pass; ASan-verified via the extended
-#test=unescape-bounds).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-02 22:27:43 +02:00
10 changed files with 85 additions and 11 deletions

View File

@@ -63,6 +63,16 @@ AC_SUBST(LT_CV_OBJDIR,$lt_cv_objdir)
# Export version info
AC_SUBST(VERSION_INFO)
# Versioned plugin name for dlopen() in hts_create_opt(); soname major is
# libtool's current - age, so this tracks VERSION_INFO bumps automatically.
HTS_SONAME_MAJOR=$((${VERSION_INFO%%:*} - ${VERSION_INFO##*:}))
case "$host_os" in
darwin*) HTS_LIBHTSJAVA_NAME="libhtsjava.$HTS_SONAME_MAJOR.dylib" ;;
*) HTS_LIBHTSJAVA_NAME="libhtsjava.so.$HTS_SONAME_MAJOR" ;;
esac
AC_DEFINE_UNQUOTED([HTS_LIBHTSJAVA_NAME], ["$HTS_LIBHTSJAVA_NAME"],
[Versioned libhtsjava runtime name, derived from VERSION_INFO])
### Default CFLAGS
DEFAULT_CFLAGS="-Wall -Wformat -Wformat-security \
-Wmultichar -Wwrite-strings -Wcast-qual -Wcast-align \

View File

@@ -69,11 +69,15 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
typedef struct t_hts_callbackarg t_hts_callbackarg;
#endif
/* Marks a symbol an external wrapper module exports back to the engine
(dllexport on Windows, nothing elsewhere). */
/* Marks a symbol an external wrapper module exports back to the engine.
Must override -fvisibility=hidden on ELF, or dlopen()ed plugins (htsjava)
hide their own hts_plug()/hts_unplug() entry points. */
#ifndef EXTERNAL_FUNCTION
#ifdef _WIN32
#define EXTERNAL_FUNCTION __declspec(dllexport)
#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
#define EXTERNAL_FUNCTION __attribute__((visibility("default")))
#else
#define EXTERNAL_FUNCTION
#endif

View File

@@ -300,6 +300,11 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
/* Was the character read successfully ? */
if (nRead == utfBufferSize) {
/* the 'continue' below skips the NUL-reserve guard: re-check */
if (utfBufferJ + utfBufferSize >= max) {
return -1;
}
/* Rollback write position to sequence start write position */
j = utfBufferJ;

View File

@@ -133,6 +133,8 @@ void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size) {
size_t n = 0;
assertf(user_size > 0 && pass_size > 0); /* the size-1 math underflows on 0 */
while (src[n] != '\0' && src[n] != ':') {
if (n < user_size - 1)
user[n] = src[n];

View File

@@ -71,7 +71,8 @@ int run_launch_ftp(FTPDownloadStruct * params);
int send_line(T_SOC soc, const char *data);
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
bounded, NUL-terminated user/pass buffers, truncating to fit. */
bounded, NUL-terminated user/pass buffers, truncating to fit.
Both sizes must be nonzero. */
void ftp_split_userpass(const char *src, const char *end, char *user,
size_t user_size, char *pass, size_t pass_size);
T_SOC get_datasocket(char *to_send, size_t to_send_size);

View File

@@ -33,15 +33,19 @@ Please visit our Website: http://www.httrack.com
#ifndef HTSJAVA_DEFH
#define HTSJAVA_DEFH
#include <stdint.h>
#ifndef HTS_DEF_FWSTRUCT_JAVA_HEADER
#define HTS_DEF_FWSTRUCT_JAVA_HEADER
typedef struct JAVA_HEADER JAVA_HEADER;
#endif
/* 10-byte on-disk .class header image, fread() directly: fields need exact
widths (LP64's 8-byte 'unsigned long' magic never matched 0xCAFEBABE). */
struct JAVA_HEADER {
unsigned long int magic;
unsigned short int minor;
unsigned short int major;
unsigned short int count;
uint32_t magic;
uint16_t minor;
uint16_t major;
uint16_t count;
};
#ifndef HTS_DEF_FWSTRUCT_RESP_STRUCT

View File

@@ -6023,9 +6023,11 @@ HTSEXT_API httrackp *hts_create_opt(void) {
"htsswf", "htsjava", "httrack-plugin", NULL
};
#else
static const char *defaultModules[] = {
"libhtsswf.so.1", "libhtsjava.so.2", "httrack-plugin", NULL
};
#ifndef HTS_LIBHTSJAVA_NAME
#define HTS_LIBHTSJAVA_NAME "libhtsjava.so" /* non-autoconf fallback */
#endif
static const char *defaultModules[] = {"libhtsswf.so.1", HTS_LIBHTSJAVA_NAME,
"httrack-plugin", NULL};
#endif
httrackp *opt = malloc(sizeof(httrackp));

View File

@@ -736,6 +736,17 @@ static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
assertf(hts_unescapeUrl("abcd", dest, sizeof(dest)) == -1);
assertf(hts_unescapeEntities("abc", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "abc") == 0);
/* raw multi-byte UTF-8 flush path (bypasses the per-byte guard) */
assertf(hts_unescapeUrl("ab\xC3\xA9", dest, sizeof(dest)) == -1);
assertf(hts_unescapeUrl("a\xC3\xA9", dest, sizeof(dest)) == 0);
assertf(strcmp(dest, "a\xC3\xA9") == 0);
{
/* %xx-encoded flush path (utfBufferJ = lastJ rollback) */
char wide[8];
assertf(hts_unescapeUrl("%C3%A9", wide, sizeof(wide)) == 0);
assertf(strcmp(wide, "\xC3\xA9") == 0);
}
printf("unescape-bounds self-test OK\n");
return 0;
}
@@ -1842,6 +1853,17 @@ static int st_ftpuser(httrackp *opt, int argc, char **argv) {
ftp_split_userpass(in, in + 802, user, sizeof(user), pass, sizeof(pass));
assertf(strlen(user) == sizeof(user) - 1);
assertf(strlen(pass) == sizeof(pass) - 1);
{
/* tight sizes + guard byte catch an off-by-one the 256 case can't */
char ubuf[16], pbuf[16];
memset(ubuf, 'Z', sizeof(ubuf));
memset(pbuf, 'Z', sizeof(pbuf));
ftp_split_userpass(in, in + 802, ubuf, 8, pbuf, 8);
assertf(strcmp(ubuf, "uuuuuuu") == 0);
assertf(strcmp(pbuf, "ppppppp") == 0);
assertf(ubuf[8] == 'Z' && pbuf[8] == 'Z');
}
printf("ftp-userpass self-test OK\n");
return 0;
}

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# The java plugin must load (versioned dlopen name) and parse a .class
# constant pool: a resource named only inside Foo.class gets crawled.
#
# Layout: build a throwaway site under a temp directory, then hand it to
# tests/local-crawl.sh together with the expectations to check.
#
# Stop at the first failing command.
set -e
# Default top_srcdir to the parent directory when the caller has not set it.
: "${top_srcdir:=..}"
# Scratch root for the generated site; removed again when the script exits.
tmproot=$(mktemp -d)
trap 'rm -rf "$tmproot"' EXIT
mkdir "$tmproot/javaclass"
# The entry page links to Foo.class only; hello.gif is never named in the HTML.
cat >"$tmproot/javaclass/index.html" <<'EOF'
<html><body><a href="Foo.class">applet</a></body></html>
EOF
# Six-byte stand-in for the image: just the "GIF89a" signature.
printf 'GIF89a' >"$tmproot/javaclass/hello.gif"
# magic/minor/major, count=2, one CONSTANT_Utf8 "hello.gif", class/superclass
# Byte by byte: CA FE BA BE magic, 00 00 minor, 00 32 major, 00 02 pool count,
# 01 Utf8 tag, 00 09 string length, the 9 characters "hello.gif", then four
# zero bytes.
printf '\xCA\xFE\xBA\xBE\x00\x00\x00\x32\x00\x02\x01\x00\x09hello.gif\x00\x00\x00\x00' \
>"$tmproot/javaclass/Foo.class"
# Crawl the site starting from index.html.
# NOTE(review): --root, --errors and --found are interpreted by local-crawl.sh,
# which is not visible here; presumably they set the served directory, require
# a zero error count and require both files in the mirror -- confirm there.
bash "$top_srcdir/tests/local-crawl.sh" --root "$tmproot" --errors 0 \
--found 'javaclass/Foo.class' \
--found 'javaclass/hello.gif' \
httrack 'BASEURL/javaclass/index.html'

View File

@@ -90,6 +90,7 @@ TESTS = \
27_local-cookies-file.test \
28_local-pause.test \
29_local-redirect-fragment.test \
30_local-fragment-link.test
30_local-fragment-link.test \
31_local-javaclass.test
CLEANFILES = check-network_sh.cache