Compare commits

..

3 Commits

Author SHA1 Message Date
Xavier Roche
b68de172fa Follow <source>/<track> as embedded near-links (#451) (#457)
src/srcset are already extracted from any element via hts_detect[], so
<source>/<track> URLs were captured. But hts_detect_embed only listed
<img src>, so at the recursion-depth boundary under --near these media
were treated as plain links and dropped, unlike <img>. Add a separate
HTML5 media table (keeping the legacy one clang-format-stable) and chain
the embed lookup through both.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 00:18:28 +02:00
Xavier Roche
aabfd34380 Refresh minor legacy constants (#453) (#456)
Three independent low-risk items under tracking issue #453:

- HTTP status enum/reason map: add 429 (Too Many Requests) and 451
  (Unavailable For Legal Reasons); appended to the HTTPStatusCode enum
  (installed header, tail-append only) and to infostatuscode_const().
- Drop the 1990s relic port 31337 from the local proxy bind fallback list.
- Pin a TLS protocol floor (TLS1.2 via SSL_CTX_set_min_proto_version, with an
  SSL_OP_NO_* fallback for OpenSSL < 1.1.0) so obsolete SSLv3/TLS1.0/1.1 aren't
  negotiated. No cert verification added: that remains by design.

Adds a -#test=status engine self-test (01_engine-status.test) asserting the
reason phrases for 429/451.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 00:12:17 +02:00
Xavier Roche
65ff9e0f11 Modernize the default User-Agent (#449) (#455)
The default UA was "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)",
a 25-year-old string naming a dead OS and trivially fingerprinted. Replace
it with an honest crawler UA carrying the HTTrack token and a reference URL:
"Mozilla/5.0 (compatible; HTTrack/3.x; +https://www.httrack.com/)".

Both the engine default (hts_create_opt) and the webhttrack mini-server
config default now share one HTS_DEFAULT_USER_AGENT macro, built from
HTTRACK_AFF_VERSION so the version token tracks releases without going stale.

Closes #449

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 23:18:28 +02:00
9 changed files with 96 additions and 11 deletions

View File

@@ -129,6 +129,8 @@ typedef enum HTTPStatusCode {
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
HTTP_EXPECTATION_FAILED = 417,
HTTP_TOO_MANY_REQUESTS = 429,
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
HTTP_INTERNAL_SERVER_ERROR = 500,
HTTP_NOT_IMPLEMENTED = 501,
HTTP_BAD_GATEWAY = 502,

View File

@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
// catch_url_init(&port,&return_host);
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
T_SOC soc;
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
int i = 0;
do {

View File

@@ -1951,6 +1951,10 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
return "Requested Range Not Satisfiable";
case 417:
return "Expectation Failed";
case 429:
return "Too Many Requests";
case 451:
return "Unavailable For Legal Reasons";
case 500:
return "Internal Server Error";
case 501:
@@ -5797,6 +5801,13 @@ HTSEXT_API int hts_init(void) {
abortLog("unable to initialize TLS: SSL_CTX_new()");
assertf("unable to initialize TLS" == NULL);
}
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
#else
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
#endif
}
#endif

View File

@@ -1309,7 +1309,8 @@ static int st_useragent(httrackp *opt, int argc, char **argv) {
(void) argv;
assertf(ua != NULL);
assertf(strcmp(ua, HTS_DEFAULT_USER_AGENT) == 0);
/* Macro-independent teeth: token, self-id URL, no Mozilla/4.x. */
/* Teeth independent of the macro: honest token + self-identifier, and no
legacy Mozilla/4.x fake-browser string (rejects the whole relic family). */
assertf(strstr(ua, "HTTrack/") != NULL);
assertf(strstr(ua, "+https://www.httrack.com/") != NULL);
assertf(strstr(ua, "Mozilla/4.") == NULL);
@@ -1317,6 +1318,24 @@ static int st_useragent(httrackp *opt, int argc, char **argv) {
return 0;
}
/* HTTP status code -> reason phrase, including the modern 429/451. */
static int st_status(httrackp *opt, int argc, char **argv) {
const char *s;
(void) opt;
(void) argc;
(void) argv;
s = infostatuscode_const(429);
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
s = infostatuscode_const(451);
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
/* A spot-check of a long-standing code, and an unknown one. */
s = infostatuscode_const(404);
assertf(s != NULL && strcmp(s, "Not Found") == 0);
assertf(infostatuscode_const(799) == NULL);
printf("status self-test OK\n");
return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1364,6 +1383,7 @@ static const struct selftest_entry {
{"dns", "", "DNS resolver/cache self-test", st_dns},
{"cookies", "", "cookie request-header self-test", st_cookies},
{"useragent", "", "default User-Agent self-test", st_useragent},
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
};
static void list_selftests(void) {

View File

@@ -80,6 +80,10 @@ htspair_t hts_detect_embed[] = {
{NULL, NULL}
};
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
static const htspair_t hts_detect_embed_html5[] = {
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
/* Internal */
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
const char *fil, const char *tag,
@@ -136,6 +140,17 @@ static int cmp_token(const char *tag, const char *cmp) {
&& !isalnum((unsigned char) tag[p]));
}
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
const char *attribute) {
int i;
for (i = 0; table[i].tag != NULL; i++) {
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
return HTS_TRUE;
}
return HTS_FALSE;
}
static int hts_acceptlink_(httrackp * opt, int ptr,
const char *adr, const char *fil, const char *tag,
const char *attribute, int *set_prio_to,
@@ -163,15 +178,9 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
/* Built-in known tags (<img src=..>, ..) */
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
int i;
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
if (cmp_token(tag, hts_detect_embed[i].tag)
&& cmp_token(attribute, hts_detect_embed[i].attr)
) {
embedded_triggered = 1;
break;
}
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
embedded_triggered = 1;
}
}

View File

@@ -497,6 +497,12 @@ static const char *GetHttpMessage(int statuscode) {
case 417:
return "Expectation Failed";
break;
case 429:
return "Too Many Requests";
break;
case 451:
return "Unavailable For Legal Reasons";
break;
case 500:
return "Internal Server Error";
break;

View File

@@ -323,4 +323,33 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
site10="$tmp/html5media"
mkdir -p "$site10"
for f in img ss plain; do gif "$site10/$f.gif"; done
printf 'x' >"$site10/v.webm"
printf 'x' >"$site10/subs.vtt"
cat >"$site10/index.html" <<EOF
<html><body><a href="leaf.html">leaf</a></body></html>
EOF
cat >"$site10/leaf.html" <<EOF
<html><body>
<img src="img.gif">
<picture><source srcset="ss.gif 2x"></picture>
<video><source src="v.webm"></video>
<video><track src="subs.vtt"></video>
<a href="plain.gif">plain link past the boundary</a>
</body></html>
EOF
out10="$tmp/html5media-out"
rm -rf "$out10"
mkdir -p "$out10"
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
found "img.gif" "$out10"
found "ss.gif" "$out10"
found "v.webm" "$out10"
found "subs.vtt" "$out10"
notfound "plain.gif" "$out10"
exit 0

7
tests/01_engine-status.test Executable file
View File

@@ -0,0 +1,7 @@
#!/bin/bash
#
set -euo pipefail
# HTTP status -> reason phrase, including the modern 429/451 (#453).
httrack -O /dev/null -#test=status run | grep -q "status self-test OK"

View File

@@ -47,6 +47,7 @@ TESTS = \
01_engine-savename.test \
01_engine-selftest-dispatch.test \
01_engine-simplify.test \
01_engine-status.test \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \