Compare commits

..

3 Commits

Author SHA1 Message Date
Xavier Roche
2d71f27659 tests: group zlib-dependent self-tests under 01_zlib-*
The MSan job runs the offline 01_engine-* self-tests but must skip any that
exercise the system zlib: uninstrumented libz floods MemorySanitizer with
false positives (MSan can't see libz initialize its own internal state, so
every byte deflate/inflate produces reads as "uninitialized"). That was a
grep -v -- '-cache' exclusion, a list that would grow with each new zlib test.

Rename the three cache tests to a 01_zlib-* prefix so the MSan job selects
01_engine-* with no exclusion list. They still run in the normal suite and
under ASan+UBSan (full make check), where uninstrumented libz is fine. The
deflate Accept-Encoding test (PR #459) follows the same convention.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-29 08:34:01 +02:00
Xavier Roche
b68de172fa Follow <source>/<track> as embedded near-links (#451) (#457)
src/srcset are already extracted from any element via hts_detect[], so
<source>/<track> URLs were captured. But hts_detect_embed only listed
<img src>, so at the recursion-depth boundary under --near these media
were treated as plain links and dropped, unlike <img>. Add a separate
HTML5 media table (keeping the legacy one clang-format-stable) and chain
the embed lookup through both.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 00:18:28 +02:00
Xavier Roche
aabfd34380 Refresh minor legacy constants (#453) (#456)
Three independent low-risk items under tracking issue #453:

- HTTP status enum/reason map: add 429 (Too Many Requests) and 451
  (Unavailable For Legal Reasons); appended to the HTTPStatusCode enum
  (installed header, tail-append only) and to infostatuscode_const().
- Drop the 1990s relic port 31337 from the local proxy bind fallback list.
- Pin a TLS protocol floor (TLS1.2 via SSL_CTX_set_min_proto_version, with an
  SSL_OP_NO_* fallback for OpenSSL < 1.1.0) so obsolete SSLv3/TLS1.0/1.1 aren't
  negotiated. No cert verification added: that remains by design.

Adds a -#test=status engine self-test (01_engine-status.test) asserting the
reason phrases for 429/451.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 00:12:17 +02:00
11 changed files with 55 additions and 8 deletions

View File

@@ -269,8 +269,9 @@ jobs:
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
run: |
set -euo pipefail
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
# 01_engine-* only; zlib-dependent self-tests are named 01_zlib-* and
# skipped here (uninstrumented libz floods MSan with false positives).
tests="$(cd tests && ls 01_engine-*.test | tr '\n' ' ')"
make check TESTS="$tests"
- name: Print the test log on failure

View File

@@ -129,6 +129,8 @@ typedef enum HTTPStatusCode {
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
HTTP_EXPECTATION_FAILED = 417,
HTTP_TOO_MANY_REQUESTS = 429,
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
HTTP_INTERNAL_SERVER_ERROR = 500,
HTTP_NOT_IMPLEMENTED = 501,
HTTP_BAD_GATEWAY = 502,

View File

@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
// catch_url_init(&port,&return_host);
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
T_SOC soc;
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
int i = 0;
do {

View File

@@ -1951,6 +1951,10 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
return "Requested Range Not Satisfiable";
case 417:
return "Expectation Failed";
case 429:
return "Too Many Requests";
case 451:
return "Unavailable For Legal Reasons";
case 500:
return "Internal Server Error";
case 501:
@@ -5797,6 +5801,13 @@ HTSEXT_API int hts_init(void) {
abortLog("unable to initialize TLS: SSL_CTX_new()");
assertf("unable to initialize TLS" == NULL);
}
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
#else
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
#endif
}
#endif

View File

@@ -1318,6 +1318,24 @@ static int st_useragent(httrackp *opt, int argc, char **argv) {
return 0;
}
/* HTTP status code -> reason phrase, including the modern 429/451. */
static int st_status(httrackp *opt, int argc, char **argv) {
const char *s;
(void) opt;
(void) argc;
(void) argv;
s = infostatuscode_const(429);
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
s = infostatuscode_const(451);
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
/* A spot-check of a long-standing code, and an unknown one. */
s = infostatuscode_const(404);
assertf(s != NULL && strcmp(s, "Not Found") == 0);
assertf(infostatuscode_const(799) == NULL);
printf("status self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1365,6 +1383,7 @@ static const struct selftest_entry {
{"dns", "", "DNS resolver/cache self-test", st_dns},
{"cookies", "", "cookie request-header self-test", st_cookies},
{"useragent", "", "default User-Agent self-test", st_useragent},
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
};
static void list_selftests(void) {

View File

@@ -497,6 +497,12 @@ static const char *GetHttpMessage(int statuscode) {
case 417:
return "Expectation Failed";
break;
case 429:
return "Too Many Requests";
break;
case 451:
return "Unavailable For Legal Reasons";
break;
case 500:
return "Internal Server Error";
break;

7
tests/01_engine-status.test Executable file
View File

@@ -0,0 +1,7 @@
#!/bin/bash
#
set -euo pipefail
# HTTP status -> reason phrase, including the modern 429/451 (#453).
httrack -O /dev/null -#test=status run | grep -q "status self-test OK"

View File

@@ -6,7 +6,7 @@
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
#
# 01_engine-cache.test writes the cache with the same build it reads back (a
# 01_zlib-cache.test writes the cache with the same build it reads back (a
# round-trip), so it cannot catch a read-path or ZIP-format regression where
# writer and reader drift together. This reads a *committed* cache frozen by an
# earlier build and asserts a fixed set of entries still decodes field- and

View File

@@ -1,4 +1,4 @@
# Committed binary fixture read by 01_engine-cache-golden.test. List it
# Committed binary fixture read by 01_zlib-cache-golden.test. List it
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
# silently drop it from the dist tarball and break "make distcheck".
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
@@ -25,9 +25,6 @@ TEST_EXTENSIONS = .test
TEST_LOG_COMPILER = $(BASH)
TESTS = \
00_runnable.test \
01_engine-cache.test \
01_engine-cache-golden.test \
01_engine-cache-writefail.test \
01_engine-charset.test \
01_engine-cmdline.test \
01_engine-cookies.test \
@@ -47,10 +44,14 @@ TESTS = \
01_engine-savename.test \
01_engine-selftest-dispatch.test \
01_engine-simplify.test \
01_engine-status.test \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \
01_engine-useragent.test \
01_zlib-cache.test \
01_zlib-cache-golden.test \
01_zlib-cache-writefail.test \
02_manpage-regen.test \
02_update-cache.test \
10_crawl-simple.test \