mirror of
https://github.com/xroche/httrack.git
synced 2026-06-29 13:35:17 +03:00
Compare commits
7 Commits
3.49.10
...
tests/zlib
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2d71f27659 | ||
|
|
b68de172fa | ||
|
|
aabfd34380 | ||
|
|
65ff9e0f11 | ||
|
|
730a1c8c5b | ||
|
|
f9ee4702a2 | ||
|
|
cca83e5f4a |
49
.github/workflows/ci.yml
vendored
49
.github/workflows/ci.yml
vendored
@@ -61,6 +61,50 @@ jobs:
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Reproduce the Debian buildds: they build in a minimal chroot with no
|
||||
# python3, so the local-server tests must SKIP (exit 77), not fail. GitHub
|
||||
# runners ship python3, so every other job hides this path; here we remove it
|
||||
# before `make check`. This is the guard that would have caught the 3.49.10-1
|
||||
# FTBFS (28_local-pause failed instead of skipping when python3 was absent).
|
||||
buildd-no-python3:
  name: build (no python3, Debian buildd)
  runs-on: ubuntu-24.04
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive

    - name: Install build dependencies
      run: |
        set -euo pipefail
        sudo apt-get update
        sudo apt-get install -y --no-install-recommends \
          build-essential autoconf automake libtool autoconf-archive \
          zlib1g-dev libssl-dev

    - name: Configure
      run: |
        set -euo pipefail
        autoreconf -fi
        ./configure

    - name: Build
      run: make -j"$(nproc)"

    - name: Test without python3
      run: |
        set -euo pipefail
        # Hide every python3* so `command -v python3` fails like it does in the
        # buildd chroot; masking with /bin/false would still resolve.
        sudo find /usr/bin /usr/local/bin -maxdepth 1 -name 'python3*' \
          -exec mv {} {}.hidden \;
        # Guard: python3 must really be gone, otherwise this job silently
        # tests the same path as every other job. This has to be an explicit
        # `if`: a bare `! command -v python3` is exempt from `set -e` (errexit
        # ignores a command whose status is inverted with `!`), so it could
        # never fail the step.
        if command -v python3; then
          echo "python3 is still reachable on PATH; refusing to run make check" >&2
          exit 1
        fi
        make check

    - name: Print the test log on failure
      if: failure()
      run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Portability: build and test on macOS (Darwin/clang) on a native runner --
|
||||
# no VM. The tree has no __APPLE__ branches, so Darwin exercises the
|
||||
# generic-Unix path on a second libc and kernel. brew's openssl@3 is keg-only,
|
||||
@@ -225,8 +269,9 @@ jobs:
|
||||
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||
# 01_engine-* only; zlib-dependent self-tests are named 01_zlib-* and
|
||||
# skipped here (uninstrumented libz floods MSan with false positives).
|
||||
tests="$(cd tests && ls 01_engine-*.test | tr '\n' ' ')"
|
||||
make check TESTS="$tests"
|
||||
|
||||
- name: Print the test log on failure
|
||||
|
||||
@@ -129,6 +129,8 @@ typedef enum HTTPStatusCode {
|
||||
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
|
||||
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
|
||||
HTTP_EXPECTATION_FAILED = 417,
|
||||
HTTP_TOO_MANY_REQUESTS = 429,
|
||||
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
|
||||
HTTP_INTERNAL_SERVER_ERROR = 500,
|
||||
HTTP_NOT_IMPLEMENTED = 501,
|
||||
HTTP_BAD_GATEWAY = 502,
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
|
||||
# Change this to download files
|
||||
if false; then
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
||||
fi
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// catch_url_init(&port,&return_host);
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
|
||||
T_SOC soc;
|
||||
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
|
||||
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
|
||||
int i = 0;
|
||||
|
||||
do {
|
||||
|
||||
@@ -30,12 +30,14 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssafe.h"
|
||||
|
||||
/* static int decode_entity(const unsigned int hash, const size_t len);
|
||||
*/
|
||||
/* static int decode_entity(const uint64_t hash, const size_t len);
|
||||
*/
|
||||
#include "htsentities.h"
|
||||
|
||||
/* hexadecimal conversion */
|
||||
@@ -50,30 +52,31 @@ static int get_hex_value(char c) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Numerical Recipes,
|
||||
see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */
|
||||
#define HASH_PRIME ( 1664525 )
|
||||
#define HASH_CONST ( 1013904223 )
|
||||
#define HASH_ADD(HASH, C) do { \
|
||||
(HASH) *= HASH_PRIME; \
|
||||
(HASH) += HASH_CONST; \
|
||||
(HASH) += (C); \
|
||||
} while(0)
|
||||
/* Entity-name hash: 64-bit FNV-1a. The generator script htsentities.sh keys
   the entity table on the same hash, so the two must stay in sync. */
#define HASH_INIT 0xcbf29ce484222325ULL /* offset basis */
#define HASH_PRIME 0x100000001b3ULL     /* multiplier */
/* Fold one byte C into the lvalue HASH: xor the byte in, then multiply. */
#define HASH_ADD(HASH, C) \
  ((void) ((HASH) = ((HASH) ^ (unsigned char) (C)) * HASH_PRIME))
|
||||
|
||||
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
|
||||
size_t i, j, ampStart, ampStartDest;
|
||||
int uc;
|
||||
int hex;
|
||||
unsigned int hash;
|
||||
uint64_t hash;
|
||||
|
||||
assertf(max != 0);
|
||||
for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
|
||||
uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
|
||||
for (i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0,
|
||||
hash = HASH_INIT;
|
||||
src[i] != '\0'; i++) {
|
||||
/* start of entity */
|
||||
if (src[i] == '&') {
|
||||
ampStart = i;
|
||||
ampStartDest = j;
|
||||
hash = 0;
|
||||
hash = HASH_INIT;
|
||||
uc = -1;
|
||||
}
|
||||
/* inside a potential entity */
|
||||
@@ -174,14 +177,11 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
/* alphanumerical entity */
|
||||
else {
|
||||
/* alphanum and not too far ('ϑ' is the longest) */
|
||||
if (i <= ampStart + 10 &&
|
||||
(
|
||||
(src[i] >= '0' && src[i] <= '9')
|
||||
|| (src[i] >= 'A' && src[i] <= 'Z')
|
||||
|| (src[i] >= 'a' && src[i] <= 'z')
|
||||
)
|
||||
) {
|
||||
/* alphanum, capped at the longest name
|
||||
* '∳' (31) */
|
||||
if (i <= ampStart + 31 && ((src[i] >= '0' && src[i] <= '9') ||
|
||||
(src[i] >= 'A' && src[i] <= 'Z') ||
|
||||
(src[i] >= 'a' && src[i] <= 'z'))) {
|
||||
/* compute hash */
|
||||
HASH_ADD(hash, (unsigned char) src[i]);
|
||||
} else {
|
||||
|
||||
13586
src/htsentities.h
13586
src/htsentities.h
File diff suppressed because it is too large
Load Diff
@@ -1,75 +1,92 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Regenerate htsentities.h from the WHATWG named character references.
|
||||
|
||||
src=html40.txt
|
||||
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
|
||||
set -euo pipefail
|
||||
|
||||
src=entities.json
|
||||
url=https://html.spec.whatwg.org/entities.json
|
||||
dest=htsentities.h
|
||||
|
||||
(
|
||||
cat <<EOF
|
||||
/*
|
||||
-- ${dest} --
|
||||
FILE GENERATED BY $0, DO NOT MODIFY
|
||||
# fnv1a NAME: print the 64-bit FNV-1a hash of NAME as a C constant
# (0x<16 hex digits>ULL). It has to agree with the hash in htsencoding.c.
# Bash arithmetic is 64-bit two's complement, so the offset basis is held as
# its wrapped (signed) bit pattern and the result stays bit-exact.
fnv1a() {
  local name=$1 pos=0 byte acc
  acc=$((0xcbf29ce484222325))
  while ((pos < ${#name})); do
    # "'x" makes printf yield the numeric value of the character x.
    printf -v byte '%d' "'${name:pos:1}"
    acc=$(((acc ^ (byte & 0xff)) * 0x100000001b3))
    pos=$((pos + 1))
  done
  printf '0x%016xULL' "$acc"
}
|
||||
|
||||
We compute the LCG hash
|
||||
(see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
|
||||
for each entity. We should in theory check using strncmp() that we
|
||||
actually have the correct entity, but this is actually statistically
|
||||
not needed.
|
||||
if [ ! -f "$src" ]; then
|
||||
curl -fsS "$url" -o "$src"
|
||||
fi
|
||||
|
||||
We may want to do better, but we expect the hash function to be uniform, and
|
||||
let the compiler be smart enough to optimize the switch (for example by
|
||||
checking in log2() intervals)
|
||||
|
||||
This code has been generated using the evil $0 script.
|
||||
*/
|
||||
# Keep ';'-terminated single-codepoint names; the ~93 multi-codepoint refs can't
|
||||
# fit decode_entity's single-codepoint return and are skipped (left verbatim).
|
||||
pairs=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length == 1)))
|
||||
| sort_by(.key)
|
||||
| .[] | "\(.key | ltrimstr("&") | rtrimstr(";"))\t\(.value.codepoints[0])"' "$src")
|
||||
|
||||
static int decode_entity(const unsigned int hash, const size_t len) {
|
||||
# Skipped multi-codepoint names, kept to prove none aliases an emitted hash.
|
||||
skipped=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length > 1)))
|
||||
| .[] | .key | ltrimstr("&") | rtrimstr(";")' "$src")
|
||||
|
||||
cases=""
|
||||
emit_hashes=""
|
||||
while IFS=$'\t' read -r name cp; do
|
||||
hash=$(fnv1a "$name")
|
||||
cases+=" /* $name */"$'\n'
|
||||
cases+=" case $hash:"$'\n'
|
||||
cases+=" if (len == ${#name}) {"$'\n'
|
||||
cases+=" return $cp;"$'\n'
|
||||
cases+=" }"$'\n'
|
||||
cases+=" break;"$'\n'
|
||||
emit_hashes+="$hash"$'\n'
|
||||
done <<<"$pairs"
|
||||
|
||||
skip_hashes=""
|
||||
while IFS= read -r name; do
|
||||
[ -n "$name" ] && skip_hashes+="$(fnv1a "$name")"$'\n'
|
||||
done <<<"$skipped"
|
||||
|
||||
# The switch keys on the hash alone, so the dispatch is correct only while every
|
||||
# emitted name hashes uniquely; prove it here, no runtime name compare needed.
|
||||
dups=$(printf '%s' "$emit_hashes" | sort | uniq -d || true)
|
||||
if [ -n "$dups" ]; then
|
||||
echo "FATAL: two entity names share a hash (duplicate switch case); change the hash:" >&2
|
||||
echo "$dups" >&2
|
||||
exit 1
|
||||
fi
|
||||
# A skipped name colliding with an emitted hash would mis-decode instead of
|
||||
# staying verbatim; forbid that too.
|
||||
aliased=$(comm -12 <(printf '%s' "$emit_hashes" | sort -u) <(printf '%s' "$skip_hashes" | sort -u) || true)
|
||||
if [ -n "$aliased" ]; then
|
||||
echo "FATAL: a skipped multi-codepoint name aliases an emitted hash:" >&2
|
||||
echo "$aliased" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cat >"$dest" <<EOF
|
||||
/* GENERATED by $0 from the WHATWG named character references
|
||||
(${url}). DO NOT EDIT.
|
||||
Dispatch keys on a 64-bit FNV-1a hash of the entity name; the generator
|
||||
aborts on any hash collision, so no runtime name compare is needed. */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
static int decode_entity(const uint64_t hash, const size_t len) {
|
||||
switch(hash) {
|
||||
EOF
|
||||
(
|
||||
if test -f ${src}; then
|
||||
cat ${src}
|
||||
else
|
||||
GET "${url}"
|
||||
fi
|
||||
) |
|
||||
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
|
||||
sed \
|
||||
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
||||
-e 's/-->$//' \
|
||||
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
|
||||
(
|
||||
read -r A
|
||||
while test -n "$A"; do
|
||||
ent="${A%% *}"
|
||||
code=$(echo "$A" | cut -f2 -d' ')
|
||||
# compute hash
|
||||
hash=0
|
||||
i=0
|
||||
a=1664525
|
||||
c=1013904223
|
||||
m="$((1 << 32))"
|
||||
while test "$i" -lt ${#ent}; do
|
||||
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
||||
hash="$((((hash * a) % (m) + d + c) % (m)))"
|
||||
i=$((i + 1))
|
||||
done
|
||||
echo -e " /* $A */"
|
||||
echo -e " case ${hash}u:"
|
||||
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
|
||||
echo -e " return ${code};"
|
||||
echo -e " }"
|
||||
echo -e " break;"
|
||||
|
||||
# next
|
||||
read -r A
|
||||
done
|
||||
)
|
||||
cat <<EOF
|
||||
}
|
||||
${cases} }
|
||||
/* unknown */
|
||||
return -1;
|
||||
}
|
||||
EOF
|
||||
) >${dest}
|
||||
|
||||
echo "wrote $dest ($(grep -c '^ case ' "$dest") entities)" >&2
|
||||
|
||||
@@ -229,6 +229,10 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEFAULT_FOOTER \
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION \
|
||||
" " HTTRACK_AFF_AUTHORS ", %s -->"
|
||||
/* Honest crawler User-Agent; no fake OS/browser to go stale. */
|
||||
#define HTS_DEFAULT_USER_AGENT \
|
||||
"Mozilla/5.0 (compatible; HTTrack/" HTTRACK_AFF_VERSION \
|
||||
"; +https://www.httrack.com/)"
|
||||
#define HTTRACK_WEB "http://www.httrack.com"
|
||||
#define HTS_UPDATE_WEBSITE \
|
||||
"http://www.httrack.com/" \
|
||||
|
||||
93
src/htslib.c
93
src/htslib.c
@@ -563,6 +563,39 @@ const char *hts_mime[][2] = {
|
||||
{"", ""}
|
||||
};
|
||||
|
||||
/* {mime, ext} rows for modern (post-2010) web formats. They live in a table
   of their own because appending to the legacy hts_mime[] above makes
   clang-format reflow that whole initializer. This table is scanned after
   hts_mime[], so it never shadows a legacy mapping. The {"", ""} row ends
   the table. */
static const char *hts_mime_modern[][2] = {
  /* images */
  {"image/webp", "webp"},
  {"image/avif", "avif"},
  {"image/heic", "heic"},
  /* fonts */
  {"font/woff", "woff"},
  {"font/woff2", "woff2"},
  {"font/ttf", "ttf"},
  {"font/otf", "otf"},
  /* data, scripts, text */
  {"application/json", "json"},
  {"application/ld+json", "jsonld"},
  {"application/manifest+json", "webmanifest"},
  {"application/wasm", "wasm"},
  {"text/javascript", "js"},
  {"text/javascript", "mjs"},
  {"text/markdown", "md"},
  /* video */
  {"video/mp4", "mp4"},
  {"video/webm", "webm"},
  {"video/ogg", "ogv"},
  {"video/mp2t", "ts"},
  /* audio */
  {"audio/mp4", "m4a"},
  {"audio/aac", "aac"},
  {"audio/ogg", "oga"},
  {"audio/opus", "opus"},
  {"audio/flac", "flac"},
  {"audio/webm", "weba"},
  /* archives */
  {"application/x-7z-compressed", "7z"},
  {"application/x-rar-compressed", "rar"},
  {"application/zstd", "zst"},
  /* terminator */
  {"", ""}
};
|
||||
|
||||
// Reserved (RFC2396)
|
||||
#define CIS(c,ch) ( ((unsigned char)(c)) == (ch) )
|
||||
#define CHAR_RESERVED(c) ( CIS(c,';') \
|
||||
@@ -1918,6 +1951,10 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
|
||||
return "Requested Range Not Satisfiable";
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
case 501:
|
||||
@@ -4308,6 +4345,20 @@ void guess_httptype(httrackp * opt, char *s, const char *fil) {
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, 1);
|
||||
}
|
||||
|
||||
// Look needle up in a {mime, ext} table and return the partner column of the
// first usable row. key is the column compared against needle (0 = mime,
// 1 = ext); the other column is what gets returned. The scan stops at the
// first row whose ext column fails strnotempty(). A row whose partner starts
// with '*' carries no value and is passed over. Returns NULL if nothing fits.
static const char *hts_mime_lookup(const char *(*table)[2], int key,
                                   const char *needle) {
  const int other = !key;
  const char *(*row)[2];

  for (row = table; strnotempty((*row)[1]); row++) {
    if (!strfield2((*row)[key], needle))
      continue;
    if ((*row)[other][0] == '*')
      continue;
    return (*row)[other];
  }
  return NULL;
}
|
||||
|
||||
// write the mime type for fil into s (capacity ssize)
|
||||
// flag: 1 to always return a type (the "application/..." / octet-stream
|
||||
// fallback) returns 1 if a type was written to s, 0 otherwise
|
||||
@@ -4331,17 +4382,15 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
const char *mime;
|
||||
|
||||
a++;
|
||||
while(strnotempty(hts_mime[j][1])) {
|
||||
if (strfield2(hts_mime[j][1], a)) {
|
||||
if (hts_mime[j][0][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][0], ssize);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
mime = hts_mime_lookup(hts_mime, 1, a);
|
||||
if (mime == NULL)
|
||||
mime = hts_mime_lookup(hts_mime_modern, 1, a);
|
||||
if (mime != NULL) {
|
||||
strlcpybuff(s, mime, ssize);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (flag) {
|
||||
@@ -4476,18 +4525,16 @@ int get_userhttptype(httrackp * opt, char *s, const char *fil) {
|
||||
// returns 1 if an extension was found (and written to s), 0 otherwise
|
||||
int give_mimext(char *s, size_t ssize, const char *st) {
|
||||
int ok = 0;
|
||||
int j = 0;
|
||||
const char *ext;
|
||||
|
||||
st = hts_effective_mime(st); /* no declared type: derive an html ext */
|
||||
s[0] = '\0';
|
||||
while((!ok) && (strnotempty(hts_mime[j][1]))) {
|
||||
if (strfield2(hts_mime[j][0], st)) {
|
||||
if (hts_mime[j][1][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][1], ssize);
|
||||
ok = 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
ext = hts_mime_lookup(hts_mime, 0, st);
|
||||
if (ext == NULL)
|
||||
ext = hts_mime_lookup(hts_mime_modern, 0, st);
|
||||
if (ext != NULL) {
|
||||
strlcpybuff(s, ext, ssize);
|
||||
ok = 1;
|
||||
}
|
||||
// wrap "x" mimetypes, such as:
|
||||
// application/x-mp3
|
||||
@@ -5754,6 +5801,13 @@ HTSEXT_API int hts_init(void) {
|
||||
abortLog("unable to initialize TLS: SSL_CTX_new()");
|
||||
assertf("unable to initialize TLS" == NULL);
|
||||
}
|
||||
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
|
||||
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
|
||||
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
|
||||
#else
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
|
||||
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -6005,8 +6059,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->shell = HTS_FALSE;
|
||||
opt->proxy.active = 0; // pas de proxy
|
||||
opt->user_agent_send = HTS_TRUE;
|
||||
StringCopy(opt->user_agent,
|
||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||
StringCopy(opt->user_agent, HTS_DEFAULT_USER_AGENT);
|
||||
StringCopy(opt->referer, "");
|
||||
StringCopy(opt->from, "");
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||
|
||||
@@ -239,6 +239,14 @@ static void basic_selftests(void) {
|
||||
assertf(strcmp(ext, "html") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "no/such-mime-type") == 0);
|
||||
assertf(ext[0] == '\0');
|
||||
// modern web formats -> extension. Avoid MIME types the
|
||||
// application/<=4-char-subtype fallback could fabricate without a row.
|
||||
assertf(give_mimext(ext, sizeof(ext), "image/webp") == 1);
|
||||
assertf(strcmp(ext, "webp") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "application/manifest+json") == 1);
|
||||
assertf(strcmp(ext, "webmanifest") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "font/woff2") == 1);
|
||||
assertf(strcmp(ext, "woff2") == 0);
|
||||
}
|
||||
// convtolower(): lower-cases into the caller buffer (bounded by its size).
|
||||
{
|
||||
@@ -293,6 +301,16 @@ static void basic_selftests(void) {
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"x.gif", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "image/gif") == 0);
|
||||
// modern extensions map back to their MIME type
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"x.webp", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "image/webp") == 0);
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"app.wasm", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/wasm") == 0);
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"mod.mjs", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "text/javascript") == 0);
|
||||
// no extension and flag==0: nothing written, returns 0
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 0) == 0);
|
||||
@@ -1284,6 +1302,40 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(ua != NULL);
|
||||
assertf(strcmp(ua, HTS_DEFAULT_USER_AGENT) == 0);
|
||||
/* Teeth independent of the macro: honest token + self-identifier, and no
|
||||
legacy Mozilla/4.x fake-browser string (rejects the whole relic family). */
|
||||
assertf(strstr(ua, "HTTrack/") != NULL);
|
||||
assertf(strstr(ua, "+https://www.httrack.com/") != NULL);
|
||||
assertf(strstr(ua, "Mozilla/4.") == NULL);
|
||||
printf("useragent self-test OK: %s\n", ua);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* HTTP status code -> reason phrase, including the modern 429/451. */
|
||||
static int st_status(httrackp *opt, int argc, char **argv) {
|
||||
const char *s;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
s = infostatuscode_const(429);
|
||||
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
|
||||
s = infostatuscode_const(451);
|
||||
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
|
||||
/* A spot-check of a long-standing code, and an unknown one. */
|
||||
s = infostatuscode_const(404);
|
||||
assertf(s != NULL && strcmp(s, "Not Found") == 0);
|
||||
assertf(infostatuscode_const(799) == NULL);
|
||||
printf("status self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1330,6 +1382,8 @@ static const struct selftest_entry {
|
||||
st_cache_writefail},
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
@@ -358,12 +358,12 @@ int smallserver(T_SOC soc, char *url, char *method, char *data, char *path) {
|
||||
{NULL, 0}
|
||||
};
|
||||
initStrElt initStr[] = {
|
||||
{"user", "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"},
|
||||
{"footer",
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->"},
|
||||
{"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}
|
||||
};
|
||||
{"user", HTS_DEFAULT_USER_AGENT},
|
||||
{"footer", "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x "
|
||||
"[XR&CO'2014], %s -->"},
|
||||
{"url2",
|
||||
"+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}};
|
||||
int i = 0;
|
||||
|
||||
for(i = 0; initInt[i].name; i++) {
|
||||
|
||||
@@ -80,6 +80,10 @@ htspair_t hts_detect_embed[] = {
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
|
||||
static const htspair_t hts_detect_embed_html5[] = {
|
||||
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
|
||||
|
||||
/* Internal */
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
|
||||
const char *fil, const char *tag,
|
||||
@@ -136,6 +140,17 @@ static int cmp_token(const char *tag, const char *cmp) {
|
||||
&& !isalnum((unsigned char) tag[p]));
|
||||
}
|
||||
|
||||
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
|
||||
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
|
||||
const char *attribute) {
|
||||
int i;
|
||||
for (i = 0; table[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
|
||||
return HTS_TRUE;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
const char *adr, const char *fil, const char *tag,
|
||||
const char *attribute, int *set_prio_to,
|
||||
@@ -163,15 +178,9 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
/* Built-in known tags (<img src=..>, ..) */
|
||||
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
|
||||
int i;
|
||||
|
||||
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, hts_detect_embed[i].tag)
|
||||
&& cmp_token(attribute, hts_detect_embed[i].attr)
|
||||
) {
|
||||
embedded_triggered = 1;
|
||||
break;
|
||||
}
|
||||
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
|
||||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
|
||||
embedded_triggered = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -497,6 +497,12 @@ static const char *GetHttpMessage(int statuscode) {
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
break;
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
break;
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
break;
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
break;
|
||||
|
||||
@@ -18,6 +18,21 @@ ent '&' '&'
|
||||
ent '<>' '<>'
|
||||
ent 'é' 'é'
|
||||
|
||||
# HTML5 names from the WHATWG set
|
||||
ent '…' '…'
|
||||
ent '⋃' '⋃'
|
||||
# longest name (31 chars) exercises the name-length cap
|
||||
ent '∳' '∳'
|
||||
# astral codepoint -> 4-byte UTF-8
|
||||
ent '𝔸' '𝔸'
|
||||
# multi-codepoint refs are skipped at generation, so left verbatim
|
||||
ent 'fj' 'fj'
|
||||
|
||||
# common HTML4 names still decode (regression guard against accidental drops)
|
||||
ent '©®™' '©®™'
|
||||
ent '—–' '—–'
|
||||
ent 'αβ' 'αβ'
|
||||
|
||||
# numeric: decimal and hex
|
||||
ent 'AB' 'AB'
|
||||
ent 'A' 'A'
|
||||
|
||||
@@ -323,4 +323,33 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||
|
||||
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
|
||||
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
|
||||
site10="$tmp/html5media"
|
||||
mkdir -p "$site10"
|
||||
for f in img ss plain; do gif "$site10/$f.gif"; done
|
||||
printf 'x' >"$site10/v.webm"
|
||||
printf 'x' >"$site10/subs.vtt"
|
||||
cat >"$site10/index.html" <<EOF
|
||||
<html><body><a href="leaf.html">leaf</a></body></html>
|
||||
EOF
|
||||
cat >"$site10/leaf.html" <<EOF
|
||||
<html><body>
|
||||
<img src="img.gif">
|
||||
<picture><source srcset="ss.gif 2x"></picture>
|
||||
<video><source src="v.webm"></video>
|
||||
<video><track src="subs.vtt"></video>
|
||||
<a href="plain.gif">plain link past the boundary</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out10="$tmp/html5media-out"
|
||||
rm -rf "$out10"
|
||||
mkdir -p "$out10"
|
||||
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
|
||||
found "img.gif" "$out10"
|
||||
found "ss.gif" "$out10"
|
||||
found "v.webm" "$out10"
|
||||
found "subs.vtt" "$out10"
|
||||
notfound "plain.gif" "$out10"
|
||||
|
||||
exit 0
|
||||
|
||||
7
tests/01_engine-status.test
Executable file
7
tests/01_engine-status.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
#
# Engine self-test: HTTP status -> reason phrase, including the modern
# 429/451 (#453).

set -euo pipefail

# Passes only when httrack prints the self-test success marker.
marker="status self-test OK"
httrack -O /dev/null -#test=status run | grep -q "$marker"
|
||||
7
tests/01_engine-useragent.test
Executable file
7
tests/01_engine-useragent.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
#
# Engine self-test: default User-Agent (#449) is the honest HTTrack token,
# with no Windows 98 relic.

set -euo pipefail

# Passes only when httrack prints the self-test success marker.
marker="useragent self-test OK"
httrack -O /dev/null -#test=useragent run | grep -q "$marker"
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# 01_zlib-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
# writer and reader drift together. This reads a *committed* cache frozen by an
|
||||
# earlier build and asserts a fixed set of entries still decodes field- and
|
||||
@@ -9,6 +9,13 @@ set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# python3 runs the local server (mirror local-crawl.sh); skip when absent, else
|
||||
# run() swallows its exit-77 and the serverless 0s/0s crawl looks like a fail.
|
||||
command -v python3 >/dev/null || {
|
||||
echo "python3 not found; skipping local crawl tests"
|
||||
exit 77
|
||||
}
|
||||
|
||||
run() { # echoes the wall-clock seconds of one crawl
|
||||
local t0 t1
|
||||
t0=$(date +%s)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Committed binary fixture read by 01_engine-cache-golden.test. List it
|
||||
# Committed binary fixture read by 01_zlib-cache-golden.test. List it
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
@@ -25,9 +25,6 @@ TEST_EXTENSIONS = .test
|
||||
TEST_LOG_COMPILER = $(BASH)
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -47,9 +44,14 @@ TESTS = \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
|
||||
Reference in New Issue
Block a user