Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
20880c1a4d Bound htslib.c pointer-destination buffer writes (batch 9)
Continues the htssafe.h pointer-destination migration (X1), where the
strcpybuff/strcatbuff macros silently fall back to a raw strcpy/strcat
when the destination is a bare char* rather than a sized array.

In htslib.c:
* fil_normalized() rebuilds the sorted query through an htsbuff bounded
  builder over the malloc'd copyBuff, then copies it back with strlcpybuff
  (capacity is the known qLen + 1).
* treathead() bounds the Location: copy with strlcpybuff against the
  location_buffer[HTS_URLMAXSIZE*2] contract.
* give_mimext(), convtolower() and cut_path() are internal (hidden, not
  HTSEXT_API), so they take an explicit destination size and the callers
  pass it: give_mimext in htsname.c/htscoremain.c/htslib.c, convtolower in
  htshash.c. cut_path has no callers.

Add strlncatbuff(dst, src, size, n) to htssafe.h: a bounded n-limited
append with explicit capacity, the missing parallel to strlcatbuff.

Cover fil_normalized query-sort, give_mimext, convtolower and cut_path with
the -#7 basic_selftests.

get_httptype() and adr_normalized() are left for a follow-up: both are
exported (HTSEXT_API), and get_httptype() exposes a real latent overflow
(a .docx/.pptx/.xlsx URL writes a 65-73 char mime type into the callers'
64-byte contenttype buffers) whose fix is a public-ABI decision.

htslib.c pointer-destination warnings: 14 -> 4.

Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-16 03:48:52 +02:00
17 changed files with 92 additions and 574 deletions

View File

@@ -1,5 +0,0 @@
[flake8]
# Match black's formatting so the two tools don't fight.
max-line-length = 88
# E203/W503 conflict with black's slice and line-break style.
extend-ignore = E203, W503

View File

@@ -285,6 +285,46 @@ static void basic_selftests(void) {
assertf(end == NULL && strcmp(tok, "a\\") == 0);
}
}
// fil_normalized(): canonicalizes a URL path. Query arguments are sorted
// alphabetically (by the text after each '?'/'&') and the query is rebuilt
// through a bounded builder; outside the query, "//" collapses to "/".
// Regression for that builder.
{
char norm[256];
assertf(strcmp(fil_normalized("/p?b=2&a=1&c=3", norm), "/p?a=1&b=2&c=3") ==
0);
assertf(strcmp(fil_normalized("/a//b", norm), "/a/b") == 0);
}
// give_mimext(): mime type -> file extension, bounded into the caller buffer.
{
char ext[16];
give_mimext(ext, sizeof(ext), "image/gif");
assertf(strcmp(ext, "gif") == 0);
give_mimext(ext, sizeof(ext), "text/html");
assertf(strcmp(ext, "html") == 0);
give_mimext(ext, sizeof(ext), "no/such-mime-type");
assertf(ext[0] == '\0');
}
// convtolower(): lower-cases into the caller buffer (bounded by its size).
{
char low[64];
assertf(strcmp(convtolower(low, sizeof(low), "ABC/Def.HTML"),
"abc/def.html") == 0);
}
// cut_path(): splits a path into directory (with trailing '/') and basename,
// each bounded by its buffer size.
{
char full[] = "/dir/sub/file.html";
char path[256];
char pname[256];
cut_path(full, path, sizeof(path), pname, sizeof(pname));
assertf(strcmp(path, "/dir/sub/") == 0);
assertf(strcmp(pname, "file.html") == 0);
}
}
/* Self-tests for the htssafe.h bounded string ops (driven by httrack -#8).
@@ -2605,7 +2645,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
printf("%s is '%s'\n", argv[na + 1], mime);
ext[0] = '\0';
give_mimext(ext, mime);
give_mimext(ext, sizeof(ext), mime);
if (ext[0]) {
printf("and its local type is '.%s'\n", ext);
}

View File

@@ -76,7 +76,7 @@ static coucal_key key_duphandler(void *arg, coucal_key_const name) {
/* Key sav hashes are using case-insensitive version */
static coucal_hashkeys key_sav_hashes(void *arg, coucal_key_const key) {
hash_struct *const hash = (hash_struct*) arg;
convtolower(hash->catbuff, (const char*) key);
convtolower(hash->catbuff, sizeof(hash->catbuff), (const char *) key);
return coucal_hash_string(hash->catbuff);
}

View File

@@ -1530,8 +1530,9 @@ void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * ret
if (retour->location) {
while(is_realspace(*(rcvd + p)))
p++; // sauter espaces
if ((int) strlen(rcvd + p) < HTS_URLMAXSIZE) // pas trop long?
strcpybuff(retour->location, rcvd + p);
if ((int) strlen(rcvd + p) < HTS_URLMAXSIZE) // not too long?
/* location aliases location_buffer[HTS_URLMAXSIZE * 2] */
strlcpybuff(retour->location, rcvd + p, HTS_URLMAXSIZE * 2);
else // erreur.. ignorer
retour->location[0] = '\0';
}
@@ -3444,16 +3445,17 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
/* Replace query by sorted query */
copyBuff = malloct(qLen + 1);
assertf(copyBuff != NULL);
copyBuff[0] = '\0';
for(i = 0; i < ampargs; i++) {
if (i == 0)
strcatbuff(copyBuff, "?");
else
strcatbuff(copyBuff, "&");
strcatbuff(copyBuff, amps[i] + 1);
{
htsbuff cb = htsbuff_ptr(copyBuff, qLen + 1);
for (i = 0; i < ampargs; i++) {
htsbuff_cat(&cb, i == 0 ? "?" : "&");
htsbuff_cat(&cb, amps[i] + 1);
}
assertf(cb.len == qLen);
}
assertf(strlen(copyBuff) == qLen);
strcpybuff(query, copyBuff);
/* query points into dest where the original qLen-byte query was */
strlcpybuff(query, copyBuff, qLen + 1);
/* Cleanup */
freet(amps);
@@ -3894,9 +3896,9 @@ HTSEXT_API size_t escape_for_html_print_full(const char *const s, char *const de
#undef ADD_CHAR
// conversion minuscules, avec buffer
char *convtolower(char *catbuff, const char *a) {
strcpybuff(catbuff, a);
// lower-case conversion into caller buffer (capacity catbuffsize)
char *convtolower(char *catbuff, size_t catbuffsize, const char *a) {
strlcpybuff(catbuff, a, catbuffsize);
hts_lowcase(catbuff); // lower case
return catbuff;
}
@@ -4073,15 +4075,15 @@ int get_userhttptype(httrackp * opt, char *s, const char *fil) {
// renvoyer extesion d'un type mime..
// ex: "image/gif" -> gif
void give_mimext(char *s, const char *st) {
void give_mimext(char *s, size_t ssize, const char *st) {
int ok = 0;
int j = 0;
s[0] = '\0';
while((!ok) && (strnotempty(hts_mime[j][1]))) {
if (strfield2(hts_mime[j][0], st)) {
if (hts_mime[j][1][0] != '*') { // Une correspondance existe
strcpybuff(s, hts_mime[j][1]);
if (hts_mime[j][1][0] != '*') { // a match exists
strlcpybuff(s, hts_mime[j][1], ssize);
ok = 1;
}
}
@@ -4102,7 +4104,7 @@ void give_mimext(char *s, const char *st) {
if (a) {
if ((int) strlen(a) >= 1) {
if ((int) strlen(a) <= 4) {
strcpybuff(s, a);
strlcpybuff(s, a, ssize);
ok = 1;
}
}
@@ -4206,7 +4208,7 @@ int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename) {
char ext[64];
ext[0] = '\0';
give_mimext(ext, mime);
give_mimext(ext, sizeof(ext), mime);
if (ext[0] != 0) { /* we have an extension for that */
const size_t ext_size = strlen(ext);
const char *file = strrchr(filename, '/'); /* fetch terminal filename */
@@ -4930,7 +4932,8 @@ void hts_freeall(void) {
// cut path and project name
// patch also initial path
void cut_path(char *fullpath, char *path, char *pname) {
void cut_path(char *fullpath, char *path, size_t path_size, char *pname,
size_t pname_size) {
path[0] = pname[0] = '\0';
if (strnotempty(fullpath)) {
if ((fullpath[strlen(fullpath) - 1] == '/')
@@ -4946,8 +4949,8 @@ void cut_path(char *fullpath, char *path, char *pname) {
a--;
if (*a == '/')
a++;
strcpybuff(pname, a);
strncatbuff(path, fullpath, (int) (a - fullpath));
strlcpybuff(pname, a, pname_size);
strlncatbuff(path, fullpath, path_size, (size_t) (a - fullpath));
}
}
}

View File

@@ -252,7 +252,7 @@ int ishtml_ext(const char *a);
int ishttperror(int err);
int get_userhttptype(httrackp * opt, char *s, const char *fil);
void give_mimext(char *s, const char *st);
void give_mimext(char *s, size_t ssize, const char *st);
int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename);
int may_unknown2(httrackp * opt, const char *mime, const char *filename);
@@ -264,7 +264,7 @@ void code64(unsigned char *a, int size_a, unsigned char *b, int crlf);
#define copychar(catbuff,a) concat(catbuff,(a),NULL)
char *convtolower(char *catbuff, const char *a);
char *convtolower(char *catbuff, size_t catbuffsize, const char *a);
void hts_lowcase(char *s);
void hts_replace(char *s, char from, char to);
int multipleStringMatch(const char *s, const char *match);
@@ -276,7 +276,8 @@ void fprintfio(FILE * fp, const char *buff, const char *prefix);
int sig_ignore_flag(int setflag); // flag ignore
#endif
void cut_path(char *fullpath, char *path, char *pname);
void cut_path(char *fullpath, char *path, size_t path_size, char *pname,
size_t pname_size);
int fexist(const char *s);
int fexist_utf8(const char *s);

View File

@@ -344,7 +344,7 @@ int url_savename(lien_adrfilsave *const afs,
mime[0] = ext[0] = '\0';
get_userhttptype(opt, mime, fil);
if (strnotempty(mime)) {
give_mimext(ext, mime);
give_mimext(ext, sizeof(ext), mime);
if (strnotempty(ext)) {
ext_chg = 1;
}
@@ -378,7 +378,7 @@ int url_savename(lien_adrfilsave *const afs,
ext_chg = 2; /* change filename */
strcpybuff(ext, r.cdispo);
} else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
give_mimext(s, r.contenttype); // obtenir extension
give_mimext(s, sizeof(s), r.contenttype); // get extension
if (strnotempty(s) > 0) { // on a reconnu l'extension
ext_chg = 1;
strcpybuff(ext, s);
@@ -403,7 +403,7 @@ int url_savename(lien_adrfilsave *const afs,
mime[0] = ext[0] = '\0';
get_userhttptype(opt, mime, fil);
if (strnotempty(mime)) {
give_mimext(ext, mime);
give_mimext(ext, sizeof(ext), mime);
if (strnotempty(ext)) {
ext_chg = 1;
}
@@ -421,7 +421,8 @@ int url_savename(lien_adrfilsave *const afs,
} else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
char s[16];
s[0] = '\0';
give_mimext(s, headers->r.contenttype); // obtenir extension
give_mimext(s, sizeof(s),
headers->r.contenttype); // get extension
if (strnotempty(s) > 0) { // on a reconnu l'extension
ext_chg = 1;
strcpybuff(ext, s);
@@ -431,7 +432,7 @@ int url_savename(lien_adrfilsave *const afs,
else if (mime_type != NULL) {
ext[0] = '\0';
if (*mime_type) {
give_mimext(ext, mime_type);
give_mimext(ext, sizeof(ext), mime_type);
}
if (strnotempty(ext)) {
char mime_from_file[128];
@@ -646,7 +647,8 @@ int url_savename(lien_adrfilsave *const afs,
ext_chg = 2; /* change filename */
strcpybuff(ext, back[b].r.cdispo);
} else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
give_mimext(s, back[b].r.contenttype); // obtenir extension
give_mimext(s, sizeof(s),
back[b].r.contenttype); // get extension
if (strnotempty(s) > 0) { // on a reconnu l'extension
ext_chg = 1;
strcpybuff(ext, s);

View File

@@ -237,6 +237,15 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), (size_t) -1, \
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__)
/**
* Append at most "N" characters of "B" to "A", "A" having a maximum capacity
* of "S".
*
* The capacity "S" is supplied by the caller and forwarded as the destination
* size, so it does not depend on sizeof(A).
* The source size forwarded to strncat_safe_() is sizeof(B) unless
* HTS_IS_NOT_CHAR_BUFFER(B) holds, in which case (size_t) -1 is passed.
* The message built from #A and #B, together with __FILE__ and __LINE__, is
* handed to strncat_safe_() to identify the call site.
* NOTE(review): "S" presumably includes room for the terminating NUL, and an
* overflow is presumably reported by strncat_safe_() rather than silently
* truncated -- confirm against strncat_safe_().
*/
#define strlncatbuff(A, B, S, N) \
strncat_safe_(A, S, B, HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
N, "overflow while appending '" #B "' to '" #A "'", __FILE__, \
__LINE__)
/**
* Copy characters of "B" to "A", "A" having a maximum capacity of "S".
*/

View File

@@ -1,15 +0,0 @@
#!/bin/bash
#
# Cookie chain against the local test server (replaces the old online
# ut/cookies/*.php fixtures). entrance.php sets cat/cake; second.php checks
# them and sets badger; third.php checks all three. A missing or wrong cookie
# returns 500, which would surface as an httrack error and a missing file, so a
# clean 3-files/0-errors run proves the cookie jar is replayed across links.
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
--found 'cookies/entrance.html' \
--found 'cookies/second.html' \
--found 'cookies/third.html' \
httrack 'BASEURL/cookies/entrance.php'

View File

@@ -1,18 +0,0 @@
#!/bin/bash
#
# HTTPS crawl against the local test server, using the shipped self-signed
# cert. httrack does not verify certs (htslib.c: SSL_CTX_new with no
# SSL_CTX_set_verify), so the self-signed cert is accepted as-is and this
# exercises the real TLS path offline. basic.html links to link.html with four
# distinct query strings, each saved under a hashed name -> 5 files.
: "${top_srcdir:=..}"
if test "$HTTPS_SUPPORT" == "no"; then
echo "no https support compiled, skipping"
exit 77
fi
bash "$top_srcdir/tests/local-crawl.sh" --tls --errors 0 --files 5 \
--found 'simple/basic.html' \
httrack 'BASEURL/simple/basic.html'

View File

@@ -1,7 +1,4 @@
# Note: EXTRA_DIST globs are NOT expanded by automake; list fixtures explicitly.
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh
TESTS_ENVIRONMENT =
TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
@@ -38,8 +35,6 @@ TESTS = \
11_crawl-international.test \
11_crawl-longurl.test \
11_crawl-parsing.test \
12_crawl_https.test \
13_local-cookies.test \
14_local-https.test
12_crawl_https.test
CLEANFILES = check-network_sh.cache

View File

@@ -476,12 +476,7 @@ target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
# Note: EXTRA_DIST globs are NOT expanded by automake; list fixtures explicitly.
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh
# note: libtool should handle that
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
TESTS_ENVIRONMENT = PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH \
@@ -514,9 +509,7 @@ TESTS = \
11_crawl-international.test \
11_crawl-longurl.test \
11_crawl-parsing.test \
12_crawl_https.test \
13_local-cookies.test \
14_local-https.test
12_crawl_https.test
CLEANFILES = check-network_sh.cache
all: all-am

View File

@@ -1,235 +0,0 @@
#!/bin/bash
#
# Launcher for httrack crawl tests against the local Python test server.
#
# Starts tests/local-server.py on an ephemeral port, discovers the port from
# the server's stdout, then runs httrack against http(s)://127.0.0.1:$PORT and
# audits the mirror. The server is always killed and the tmpdir removed on exit.
#
# The token BASEURL in any httrack argument is replaced with the discovered
# http(s)://127.0.0.1:$PORT base. --found/--directory paths are relative to the
# discovered host root (127.0.0.1_<port>/), since the random port leaks into
# the mirror directory name.
#
# Usage:
# bash local-crawl.sh [--tls] [--root DIR] \
# --errors N --files N --found PATH ... --directory PATH ... \
# httrack BASEURL/some/path [httrack-args...]
set -u
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"
root="${LOCAL_SERVER_ROOT:-${testdir}/server-root}"
cert="${testdir}/server.crt"
key="${testdir}/server.key"
tls=
verbose=
tmpdir=
serverpid=
crawlpid=
function warning {
echo "** $*" >&2
return 0
}
function die {
warning "$*"
exit 1
}
function debug {
test -n "$verbose" && echo "$*" >&2
return 0
}
function info { printf "[%s] ..\t" "$*" >&2; }
function result { echo "$*" >&2; }
# Exit/signal handler: stop the crawl and the test server if they are still
# recorded as running, then remove the temporary directory unless --no-purge
# was requested. Each pid variable is cleared once handled, so a second call
# is a no-op for that process.
function cleanup {
  # A crawl still recorded here is killed outright.
  if [ -n "$crawlpid" ]; then
    kill -KILL "$crawlpid" 2>/dev/null
    crawlpid=
  fi

  if [ -n "$serverpid" ]; then
    kill "$serverpid" 2>/dev/null
    # Reap the server so its port is released before the tmpdir/log goes away.
    wait "$serverpid" 2>/dev/null
    serverpid=
  fi

  # Purge the working directory only when it exists and purging is allowed.
  if [ -n "$tmpdir" ] && [ -d "$tmpdir" ] && [ -z "$nopurge" ]; then
    rm -rf "$tmpdir"
  fi
}
# assert_equals LABEL EXPECTED ACTUAL
# Announces LABEL, then reports "OK" when EXPECTED and ACTUAL are the same
# string; otherwise reports both values and exits the script with status 1.
function assert_equals {
  info "$1"
  if [ "$2" = "$3" ]; then
    result "OK ($2)"
    return
  fi
  result "expected '$2', got '$3'"
  exit 1
}
nopurge=
trap cleanup EXIT HUP INT QUIT PIPE TERM
# python3 is required; mirror check-network.sh's skip-with-77 convention.
command -v python3 >/dev/null || ! echo "python3 not found; skipping local crawl tests" || exit 77
tmptopdir=${TMPDIR:-/tmp}
test -d "$tmptopdir" || mkdir -p "$tmptopdir" || die "no temporary directory; set TMPDIR"
tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create tmpdir"
# --- parse leading control flags --------------------------------------------
declare -a audit=()
scheme=http
pos=0
args=("$@")
nargs=$#
while test "$pos" -lt "$nargs"; do
case "${args[$pos]}" in
--debug) verbose=1 ;;
--no-purge)
nopurge=1
audit+=("--no-purge")
;;
--tls)
tls=1
scheme=https
;;
--root)
pos=$((pos + 1))
root="${args[$pos]}"
;;
--errors | --files)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
;;
--found | --not-found | --directory)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
;;
httrack)
pos=$((pos + 1))
break
;;
*) die "unrecognized option ${args[$pos]}" ;;
esac
pos=$((pos + 1))
done
# --- start the server --------------------------------------------------------
test -r "$server" || die "cannot read $server"
serverlog="${tmpdir}/server.log"
serverargs=(--root "$root")
if test -n "$tls"; then
serverargs+=(--tls --cert "$cert" --key "$key")
fi
debug "starting python3 $server ${serverargs[*]}"
python3 "$server" "${serverargs[@]}" >"$serverlog" 2>&1 &
serverpid=$!
# Wait for the "PORT <n>" line (server prints it once bound).
port=
for _ in $(seq 1 50); do
if test -s "$serverlog"; then
line=$(head -n1 "$serverlog")
if test "${line%% *}" == "PORT"; then
port="${line#PORT }"
break
fi
fi
kill -0 "$serverpid" 2>/dev/null || die "server exited early: $(cat "$serverlog")"
sleep 0.1
done
test -n "$port" || die "could not discover server port: $(cat "$serverlog")"
debug "server listening on ${scheme}://127.0.0.1:${port}"
baseurl="${scheme}://127.0.0.1:${port}"
# --- substitute BASEURL in the remaining (httrack) args ----------------------
declare -a hts=()
while test "$pos" -lt "$nargs"; do
hts+=("${args[$pos]//BASEURL/$baseurl}")
pos=$((pos + 1))
done
# --- run httrack -------------------------------------------------------------
which httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
test -n "$ver" || die "could not run httrack"
out="${tmpdir}/crawl"
mkdir "$out" || die "could not create $out"
# Localhost is fast; disable the rate/bandwidth safety limits but keep a
# max-time backstop so a hang cannot wedge the suite.
declare -a moreargs=(--quiet --max-time=120 --timeout=30 --disable-security-limits --robots=0)
log="${tmpdir}/log"
info "running httrack ${hts[*]}"
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" "${moreargs[@]}" "${hts[@]}" >"$log" 2>&1 &
crawlpid=$!
wait "$crawlpid"
crawlres=$?
crawlpid=
# httrack exits 0 even on hard connect/DNS errors, so this is a backstop only;
# the real guard is the audit below (--errors 0 plus the host-root existence check).
test "$crawlres" -eq 0 || ! result "httrack exited $crawlres" || {
cat "$log" >&2
exit 1
}
result "OK"
grep -iE "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt" >&2
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
hostroot=
for cand in "${out}/127.0.0.1_${port}" "${out}/127.0.0.1"; do
if test -d "$cand"; then
hostroot="$cand"
break
fi
done
test -n "$hostroot" || die "could not find host root under $out"
debug "host root: $hostroot"
# --- audit -------------------------------------------------------------------
i=0
while test "$i" -lt "${#audit[@]}"; do
case "${audit[$i]}" in
--errors)
i=$((i + 1))
assert_equals "checking errors" "${audit[$i]}" \
"$(grep -iEc "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt")"
;;
--files)
i=$((i + 1))
nFiles=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${out}/hts-log.txt" |
sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
assert_equals "checking files" "${audit[$i]}" "$nFiles"
;;
--found)
i=$((i + 1))
info "checking for ${audit[$i]}"
if test -f "${hostroot}/${audit[$i]}"; then result "OK"; else
result "not found"
exit 1
fi
;;
--not-found)
i=$((i + 1))
info "checking absence of ${audit[$i]}"
if test ! -f "${hostroot}/${audit[$i]}"; then result "OK"; else
result "present"
exit 1
fi
;;
--directory)
i=$((i + 1))
info "checking for dir ${audit[$i]}"
if test -d "${hostroot}/${audit[$i]}"; then result "OK"; else
result "not found"
exit 1
fi
;;
esac
i=$((i + 1))
done

View File

@@ -1,182 +0,0 @@
#!/usr/bin/env python3
"""Self-contained local web server for httrack's crawl tests.
Serves static fixtures from a docroot plus a handful of dynamic endpoints
(cookies, ...) so httrack can be exercised over loopback, deterministically and
offline, instead of crawling the live ut.httrack.com.
Binds to an ephemeral port (port 0) and prints the chosen port to stdout as
"PORT <n>\n" so a launcher can discover it. Pass --tls to wrap the socket with
the shipped self-signed test cert; httrack does not verify certs, so no CA
trust plumbing is needed.
stdlib only (http.server + ssl) -- no new build or runtime dependency.
"""
import argparse
import os
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import quote, unquote, urlsplit
# Cookie chain replicated from the old ut/cookies/*.php fixtures.
COOKIE_PATH = "/cookies/"
COOKIES = {
"cat": "dog",
"cake": "is a lie!",
"badger": "mushroom, with 'ants'",
}
PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
\t"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
\t<title>Sample test</title>
</head>
<body>
{body}
</body>
</html>
"""
class Handler(SimpleHTTPRequestHandler):
    """Serve the dynamic test routes, falling back to static files.

    Paths listed in ROUTES (the cookie chain and robots.txt) are answered by
    the matching route_* method; every other GET/HEAD request is delegated to
    SimpleHTTPRequestHandler.
    """

    def log_message(self, fmt, *args):
        # Stay quiet unless LOCAL_SERVER_VERBOSE is set; the launcher captures
        # httrack's own log anyway.
        if not os.environ.get("LOCAL_SERVER_VERBOSE"):
            return
        super().log_message(fmt, *args)

    # --- helpers -----------------------------------------------------------

    def request_cookies(self):
        """Return the request's Cookie header as a {name: decoded-value} dict.

        Values are url-decoded (like PHP's $_COOKIE), undoing the encoding
        applied by set_cookie. Fragments without an "=" are ignored.
        """
        header = self.headers.get("Cookie", "")
        cookies = {}
        for fragment in header.split(";"):
            name, separator, value = fragment.strip().partition("=")
            if separator:
                cookies[name.strip()] = unquote(value.strip())
        return cookies

    def set_cookie(self, name, value):
        """Queue a Set-Cookie header for the response being built.

        The value is url-encoded (like PHP's setcookie()) so spaces, quotes
        and commas remain a single token that httrack can store and replay
        verbatim.
        """
        encoded_value = quote(value)
        self._set_cookies.append(f"{name}={encoded_value}; Path={COOKIE_PATH}")

    def send_html(self, body, status=200, extra_status=None):
        """Send `body` wrapped in the PAGE template, plus any queued cookies.

        `extra_status` is forwarded to send_response() as the status message.
        The payload itself is omitted for HEAD requests.
        """
        payload = PAGE.format(body=body).encode("utf-8")
        self.send_response(status, extra_status)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        for cookie_header in self._set_cookies:
            self.send_header("Set-Cookie", cookie_header)
        self.end_headers()
        if self.command == "HEAD":
            return
        self.wfile.write(payload)

    def fail_cookie(self, what):
        # Same contract as the old PHP fixtures: answer 500 and put the reason
        # in the status line.
        reason = f"The {what} is missing or invalid"
        self.send_html("", status=500, extra_status=reason)

    # --- dynamic routes ----------------------------------------------------

    def route_entrance(self):
        """First page of the chain: sets cat and cake, links to second.php."""
        for name in ("cat", "cake"):
            self.set_cookie(name, COOKIES[name])
        self.send_html('\tThis is a <a href="second.php">link</a>')

    def route_second(self):
        """Second page: requires cat and cake, sets badger, links to third.php."""
        jar = self.request_cookies()
        for name in ("cat", "cake"):
            if jar.get(name) != COOKIES[name]:
                return self.fail_cookie(name)
        self.set_cookie("badger", COOKIES["badger"])
        self.send_html('\tThis is a <a href="third.php">link</a>')

    def route_third(self):
        """Last page: requires all three cookies."""
        jar = self.request_cookies()
        for name in ("cat", "cake", "badger"):
            if jar.get(name) != COOKIES[name]:
                return self.fail_cookie(name)
        self.send_html("\tThis is a test.")

    def route_robots(self):
        """Answer robots.txt with a plain-text body that disallows nothing."""
        body = b"User-agent: *\nDisallow:\n"
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        if self.command == "HEAD":
            return
        self.wfile.write(body)

    # Request path (query string excluded) -> handler method.
    ROUTES = {
        "/cookies/entrance.php": route_entrance,
        "/cookies/second.php": route_second,
        "/cookies/third.php": route_third,
        "/robots.txt": route_robots,
    }

    # --- dispatch ----------------------------------------------------------

    def dispatch(self):
        """Run the dynamic route for this request, if any.

        Resets the queued Set-Cookie list, then returns True when a route
        handled the request and False when the caller should serve a static
        file instead.
        """
        self._set_cookies = []
        route = self.ROUTES.get(urlsplit(self.path).path)
        if route is None:
            return False
        route(self)
        return True

    def do_GET(self):
        if self.dispatch():
            return
        super().do_GET()

    def do_HEAD(self):
        if self.dispatch():
            return
        super().do_HEAD()
def main():
    """Run the test server until interrupted.

    Parses the command line, binds an ephemeral port on --bind, optionally
    wraps the listening socket in TLS, prints "PORT <n>" on stdout so the
    launcher can discover the port, then serves requests forever.

    Exits through argparse's usage error (status 2) when --tls is given
    without --cert, instead of failing later inside the ssl module.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--root", required=True, help="docroot for static files")
    parser.add_argument("--bind", default="127.0.0.1", help="bind address")
    parser.add_argument("--tls", action="store_true", help="serve HTTPS")
    parser.add_argument("--cert", help="TLS certificate (PEM)")
    parser.add_argument("--key", help="TLS private key (PEM)")
    args = parser.parse_args()

    # load_cert_chain() needs a certificate path; --key stays optional because
    # the private key may live in the certificate file.
    if args.tls and not args.cert:
        parser.error("--tls requires --cert")

    root = os.path.abspath(args.root)

    def factory(*a, **kw):
        # Bind the docroot into every handler instance.
        return Handler(*a, directory=root, **kw)

    # Port 0 lets the OS pick a free ephemeral port.
    httpd = ThreadingHTTPServer((args.bind, 0), factory)
    try:
        if args.tls:
            import ssl

            ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
            ctx.load_cert_chain(certfile=args.cert, keyfile=args.key)
            httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)
        port = httpd.socket.getsockname()[1]
        # The launcher reads this line to discover the ephemeral port.
        print(f"PORT {port}", flush=True)
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            pass
    finally:
        # Release the listening socket on every exit path, including a failed
        # TLS setup.
        httpd.server_close()
if __name__ == "__main__":
main()

View File

@@ -1,18 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Sample test</title>
</head>
<body>
This is a <a href="link.html?v=1">link</a>
This is a <a href='link.html?v=2'>link</a>
This is a <a href="./link.html?v=3">link</a>
This is a <a href=link.html?v=4>link</a>
</body>

View File

@@ -1,3 +0,0 @@
This is a link.
Go back to <a href="basic.html">home</a>.

View File

@@ -1,21 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIDbzCCAlegAwIBAgIUdWkDDomnY3WW95UqJ+UOASuR/i0wDQYJKoZIhvcNAQEL
BQAwODESMBAGA1UEAwwJMTI3LjAuMC4xMSIwIAYDVQQKDBlIVFRyYWNrIGxvY2Fs
IHRlc3Qgc2VydmVyMCAXDTI2MDYxNTE0NDQxMFoYDzIwNTYwNjA3MTQ0NDEwWjA4
MRIwEAYDVQQDDAkxMjcuMC4wLjExIjAgBgNVBAoMGUhUVHJhY2sgbG9jYWwgdGVz
dCBzZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDx78mogNhT
noWwRa51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf
3rwj79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJC
SGxIin9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4
ZbPgwep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaE
nQrAlTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJx
rjVEPfA8QSbtAgMBAAGjbzBtMB0GA1UdDgQWBBTHE0KKW8REV4HxajzVsIBxz3iL
9zAfBgNVHSMEGDAWgBTHE0KKW8REV4HxajzVsIBxz3iL9zAPBgNVHRMBAf8EBTAD
AQH/MBoGA1UdEQQTMBGHBH8AAAGCCWxvY2FsaG9zdDANBgkqhkiG9w0BAQsFAAOC
AQEAYlTEftrwGJBXuPmtxhmtw2HO/VTC4TGnq67hH5H+ptwgZJuuxCQ5KW6flTyp
FTyMhha33WD4EBL3wqqJsWr9Y4BXqi4G0lRqXBcC1oIUa2VYIDMER7kaY1qTSqE8
ARpwdB2BhvngAzDLc+4Jt4jQMRGr8fHAwxpDBoIZ1knbyzYNP73Bajse6/8YtxUu
nB2BsldjZnLvyHvRxUpWp92OyQih4jYSrlN6olDFlKDg7++kMhkHtJQW9a1t54VN
0ZXrB1ZRuHUUvGBq26x71riTWor7HNOSQaGeCMQjZNQkh5tfshNygUGSZVXTEwhG
xSrOL7NqBt2+EkVwf7LjGzjmBw==
-----END CERTIFICATE-----

View File

@@ -1,28 +0,0 @@
-----BEGIN PRIVATE KEY-----
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDx78mogNhTnoWw
Ra51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf3rwj
79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJCSGxI
in9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4ZbPg
wep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaEnQrA
lTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJxrjVE
PfA8QSbtAgMBAAECggEACgNK4klq1T3IpKdNoBY5yoE7CbUQZBNkBpSPRxHgBezj
SVFfgrZGnOySrIJSt4JHtuynG2Hl+0ku74HRep/ck+eOsh5W3mZvGvMLnGxhwR3u
Or99osTIgU0VQTkpC0SLQ16FCnih0uJycNIikdLR7uuya1tt1OyIBzK7XlNGIywT
p85zJc7/6TfTC9eM7lqh7JGR7KplBxSvgZL1pUr7y4rNpKms6uzOvPND79CcKnbU
BBA9Tu4qdOkoOljsZKkvh3pihxyG9X6d8QTZ/uX3pkvliwSFBc+Sz9EootA3/4r5
gVWpQ2t/AY7fY4hqzLIX/HivVaPj3cWk1G+SHm0XNQKBgQD5I9rijqFvV/p6FmUl
FbnjJFFHHgZLivlGxAC5vOyJNQQaqdeDzg7yMotNmQTggVGjT6sjdosQb3n+ctuk
EhQnZSU5VkNKv1+PTR35WrRkaECCaqz3Pv79pV9GVcX3it7UuYjNiOeSPqINWe+X
49JwnJFz+qQ1BchAwOis4zkENwKBgQD4mShDaYLOO97VpgZj4cGxHHWyEK9CRQvp
I7HxRmfaWS3JHwb88lOmALEU6pAj5cYJPAznv8BnUWcVHalZbkQ1JWYtUJRqj6OI
Ym7rw/nm4Ay5ijbdEism173dSk3IjOe+PdAlxzsOuVzYdBTqElmeQWtBzhY9aHvX
r+A02C2j+wKBgHHDo6Gsi57yR5gUPd9vSlCkNtEIrss0DJv5yHMIB+KnaNZcE+NF
5qFF30Jxyz5RDtxJ9tXcvaeln8lG3XDQKI/MqfDCqTuqo5ImHrfMaW8oA70JxS2p
gHqGVzkg1aMxsIrmpcdk6olnPExocvWivGdbtzeEjhMALu8Sp6y6nUCFAoGBAK5h
KLgYw/OMVaQCIMthaa+l6f0s7PMMYe1453H6VBD6qz4/8HPwO7LfG1gzrUYxADgs
ElVh0UHn/On383nS+i9Ze5Hfyyvwc+LQQURKJPrJQMPJavCptPE7NmiKnYNHK6vr
yh0l4oxShAklbCJBGvICq4zuVfVfXDeQnDIVTfaPAoGBAMCrZqYdOUhUu+aUqxZq
qO/TTQxrxftU63jGUg+o042TdgI4KWLn07wvHJ8/E2OqF35eXenvcuKbNLI1l72J
4cp+3cUv8iAXThTRYEztr5CS/wta4o4CNN8zfjn5dV9AI4Hmt4V7EaGWpBcViGbj
n0Mhag+dO8DHuenqi1yfMrAt
-----END PRIVATE KEY-----