mirror of
https://github.com/xroche/httrack.git
synced 2026-06-16 07:13:45 +03:00
Compare commits
22 Commits
ci/mkdeb-s
...
feature/lo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f9f27b924d | ||
|
|
a6fc0e9dab | ||
|
|
f227135d16 | ||
|
|
223564eaca | ||
|
|
7db49a64b6 | ||
|
|
f1c04c10eb | ||
|
|
17fc54869d | ||
|
|
d2e43549d8 | ||
|
|
a9b16d96ea | ||
|
|
4ed828ff78 | ||
|
|
82ace34c4d | ||
|
|
3970eb3706 | ||
|
|
d3c41b31e8 | ||
|
|
f8367eeac7 | ||
|
|
9279a4b349 | ||
|
|
b52e8c4c0f | ||
|
|
665f51d1a0 | ||
|
|
e4e5d4699a | ||
|
|
a50691c0f8 | ||
|
|
5f96e86818 | ||
|
|
6002bc20ca | ||
|
|
bdbc741597 |
5
.flake8
Normal file
5
.flake8
Normal file
@@ -0,0 +1,5 @@
|
||||
[flake8]
|
||||
# Match black's formatting so the two tools don't fight.
|
||||
max-line-length = 88
|
||||
# E203/W503 conflict with black's slice and line-break style.
|
||||
extend-ignore = E203, W503
|
||||
133
.github/workflows/ci.yml
vendored
133
.github/workflows/ci.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
env:
|
||||
CC: ${{ matrix.cc }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
name: build (macOS arm64, clang)
|
||||
runs-on: macos-14
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
name: build (linux i386, gcc -m32)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
@@ -133,6 +133,97 @@ jobs:
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Memory safety: build and run the suite under AddressSanitizer +
|
||||
# UndefinedBehaviorSanitizer. The offline engine self-tests drive the parsers
|
||||
# that chew on untrusted crawled input (charset, mime, HTML, entities, IDNA,
|
||||
# filters, cache) straight through the sanitizers, so a buffer overrun,
|
||||
# use-after-free, or signed overflow there fails the build instead of slipping
|
||||
# past a plain -O2 build. Uses gcc's sanitizer runtimes; one job is enough (the bug class is
|
||||
# arch-independent and the matrix already covers compile portability).
|
||||
sanitize:
|
||||
name: sanitize (ASan+UBSan, gcc)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev libssl-dev
|
||||
|
||||
- name: Configure (sanitized)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure CC=gcc \
|
||||
CFLAGS="-fsanitize=address,undefined -fno-sanitize-recover=all -g -O1 -fno-omit-frame-pointer" \
|
||||
LDFLAGS="-fsanitize=address,undefined"
|
||||
|
||||
- name: Build
|
||||
run: make -j"$(nproc)"
|
||||
|
||||
- name: Test (sanitized)
|
||||
# Leaks at exit are out of scope (the CLI frees little on the way out);
|
||||
# we want memory-safety errors, so turn leak detection off and make every
|
||||
# other finding abort the run.
|
||||
#
|
||||
# Poison fresh allocations with 0xCA and freed blocks with 0xCB (decimal
|
||||
# 202/203) so memory never reads back as accidental zeros: a missing-NUL
|
||||
# fread buffer then runs strlen off into the redzone instead of stopping
|
||||
# at a lucky zero. Distinct bytes tell the two apart in a dump (0xCA =
|
||||
# uninitialized, 0xCB = use-after-free). ASan caps its malloc fill at 4096
|
||||
# bytes by default, so max_malloc_fill_size lifts it to cover large cache
|
||||
# buffers; free_fill flags use-after-free reads.
|
||||
env:
|
||||
ASAN_OPTIONS: detect_leaks=0:abort_on_error=1:halt_on_error=1:strict_string_checks=1:malloc_fill_byte=202:max_malloc_fill_size=2147483647:free_fill_byte=203:max_free_fill_size=2147483647
|
||||
UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1
|
||||
run: make check
|
||||
|
||||
- name: Print the test log on failure
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Optional-dependency build: compile and test with HTTPS/OpenSSL disabled --
|
||||
# the configuration users on minimal systems build; libssl is not even
|
||||
# installed here so configure cannot silently re-enable it. The matrix above
|
||||
# always has libssl, so the #if HTS_USEOPENSSL branches would otherwise never
|
||||
# be compiled and could rot unnoticed.
|
||||
no-ssl:
|
||||
name: build (no openssl, --disable-https)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies (no libssl)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool autoconf-archive zlib1g-dev
|
||||
|
||||
- name: Configure (https disabled)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure --disable-https
|
||||
|
||||
- name: Build
|
||||
run: make -j"$(nproc)"
|
||||
|
||||
- name: Test
|
||||
run: make check
|
||||
|
||||
- name: Print the test log on failure
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Validate the Debian packaging via the same script maintainers release with.
|
||||
# One amd64/gcc run is enough: packaging (control/rules/manifest/lintian/quilt
|
||||
# source build) is arch- and compiler-independent, and the build matrix above
|
||||
@@ -141,7 +232,7 @@ jobs:
|
||||
name: deb package (lintian)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
@@ -167,13 +258,41 @@ jobs:
|
||||
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
||||
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
||||
|
||||
# Release-tarball integrity: `make distcheck` rolls the dist tarball, then
|
||||
# configures, builds and tests it out-of-tree from a read-only source tree and
|
||||
# checks nothing is left behind. Catches a file referenced in *_SOURCES or
|
||||
# EXTRA_DIST but missing from the tarball -- the same "ships broken to users"
|
||||
# class as a stale committed Makefile.in.
|
||||
distcheck:
|
||||
name: distcheck (release tarball)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev libssl-dev
|
||||
|
||||
- name: distcheck
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure
|
||||
make -j"$(nproc)" distcheck
|
||||
|
||||
dco:
|
||||
name: DCO sign-off
|
||||
# Only checkable on a PR, where we have the base..head commit range.
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -202,7 +321,7 @@ jobs:
|
||||
name: lint (shellcheck, shfmt)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Install linters
|
||||
env:
|
||||
@@ -231,7 +350,7 @@ jobs:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
@@ -114,5 +114,12 @@ EXTRA_DIST = httrack.h webhttrack \
|
||||
proxy/proxytrack.h \
|
||||
proxy/store.h \
|
||||
proxy/proxytrack.vcproj \
|
||||
coucal/* \
|
||||
*.dsw *.dsp *.vcproj
|
||||
coucal/LICENSE \
|
||||
coucal/Makefile \
|
||||
coucal/README.md \
|
||||
coucal/sample.c \
|
||||
coucal/tests.c \
|
||||
htsjava.vcproj \
|
||||
httrack.dsp httrack.dsw httrack.vcproj \
|
||||
libhttrack.dsp libhttrack.dsw libhttrack.vcproj \
|
||||
webhttrack.dsp webhttrack.dsw webhttrack.vcproj
|
||||
|
||||
@@ -565,8 +565,15 @@ EXTRA_DIST = httrack.h webhttrack \
|
||||
proxy/proxytrack.h \
|
||||
proxy/store.h \
|
||||
proxy/proxytrack.vcproj \
|
||||
coucal/* \
|
||||
*.dsw *.dsp *.vcproj
|
||||
coucal/LICENSE \
|
||||
coucal/Makefile \
|
||||
coucal/README.md \
|
||||
coucal/sample.c \
|
||||
coucal/tests.c \
|
||||
htsjava.vcproj \
|
||||
httrack.dsp httrack.dsw httrack.vcproj \
|
||||
libhttrack.dsp libhttrack.dsw libhttrack.vcproj \
|
||||
webhttrack.dsp webhttrack.dsw webhttrack.vcproj
|
||||
|
||||
all: all-am
|
||||
|
||||
|
||||
Submodule src/coucal updated: 73ada07555...fadf29bd2a
@@ -939,7 +939,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), previous_save), "rb");
|
||||
|
||||
if (fp != NULL) {
|
||||
r.adr = (char *) malloct((int) r.size + 4);
|
||||
r.adr = (char *) malloct((int) r.size + 1);
|
||||
if (r.adr != NULL) {
|
||||
if (r.size > 0
|
||||
&& fread(r.adr, 1, (int) r.size, fp) != r.size) {
|
||||
@@ -948,7 +948,8 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
sprintf(r.msg, "Read error in cache disk data: %s",
|
||||
strerror(last_errno));
|
||||
}
|
||||
} else if (r.size >= 0)
|
||||
*(r.adr + r.size) = '\0';
|
||||
} else {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg,
|
||||
@@ -965,7 +966,7 @@ static htsblk cache_readex_new(httrackp * opt, cache_back * cache,
|
||||
// Data in cache.
|
||||
else {
|
||||
// lire fichier (d'un coup)
|
||||
r.adr = (char *) malloct((int) r.size + 4);
|
||||
r.adr = (char *) malloct((int) r.size + 1);
|
||||
if (r.adr != NULL) {
|
||||
if (unzReadCurrentFile((unzFile) cache->zipInput, r.adr, (int) r.size) != r.size) { // erreur
|
||||
freet(r.adr);
|
||||
@@ -1245,13 +1246,14 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
|
||||
FILE *fp = FOPEN(fconv(catbuff, sizeof(catbuff), return_save), "rb");
|
||||
|
||||
if (fp != NULL) {
|
||||
r.adr = (char *) malloct((size_t) r.size + 4);
|
||||
r.adr = (char *) malloct((size_t) r.size + 1);
|
||||
if (r.adr != NULL) {
|
||||
if (r.size > 0
|
||||
&& fread(r.adr, 1, (size_t) r.size, fp) != r.size) {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg, "Read error in cache disk data");
|
||||
}
|
||||
} else if (r.size >= 0)
|
||||
*(r.adr + r.size) = '\0';
|
||||
} else {
|
||||
r.statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(r.msg,
|
||||
@@ -1266,7 +1268,7 @@ static htsblk cache_readex_old(httrackp * opt, cache_back * cache,
|
||||
}
|
||||
} else {
|
||||
// lire fichier (d'un coup)
|
||||
r.adr = (char *) malloct((size_t) r.size + 4);
|
||||
r.adr = (char *) malloct((size_t) r.size + 1);
|
||||
if (r.adr != NULL) {
|
||||
if (fread(r.adr, 1, (size_t) r.size, cache->olddat) != r.size) { // erreur
|
||||
freet(r.adr);
|
||||
@@ -1369,10 +1371,11 @@ int cache_readdata(cache_back * cache, const char *str1, const char *str2,
|
||||
|
||||
cache_rint(cache->olddat, &len);
|
||||
if (len > 0) {
|
||||
char *mem_buff = (char *) malloct(len + 4); /* Plus byte 0 */
|
||||
char *mem_buff = (char *) malloct(len + 1); /* trailing \0 */
|
||||
|
||||
if (mem_buff) {
|
||||
if (fread(mem_buff, 1, len, cache->olddat) == len) { // lire tout (y compris statuscode etc)*/
|
||||
mem_buff[len] = '\0';
|
||||
*inbuff = mem_buff;
|
||||
*inlen = len;
|
||||
return 1;
|
||||
|
||||
@@ -182,6 +182,16 @@ static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
fail++;
|
||||
}
|
||||
|
||||
/* The loaded body must be NUL-terminated at [size]: cache_readex's strlen()
|
||||
consumers (htscore.c:1046, htscache.c) rely on it, and a missing
|
||||
terminator is a heap over-read. The buffer is malloc(size + slack), so
|
||||
reading [size] is in bounds. */
|
||||
if (r.adr != NULL && r.adr[r.size] != '\0') {
|
||||
fprintf(stderr, "cache-selftest: %s%s: body not NUL-terminated at [size]\n",
|
||||
adr, fil);
|
||||
fail++;
|
||||
}
|
||||
|
||||
#undef CHECK_STR
|
||||
|
||||
if (r.adr != NULL) {
|
||||
@@ -208,6 +218,107 @@ static void gen_body(char *buf, size_t len, int kind) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Exercise the disk-fallback read path: a record stored with X-In-Cache: 0
|
||||
keeps its body on disk (not in the ZIP), and cache_readex must load it from
|
||||
there. The one-shot crawl tests never re-read such a body into memory, so
|
||||
this path otherwise has no runtime coverage. We store the header with
|
||||
all_in_cache=0 and a non-hypertext content-type (-> X-In-Cache: 0), create
|
||||
the body at the exact fconv()-resolved path the reader uses, then read it
|
||||
back and assert it round-trips and is NUL-terminated. */
|
||||
static int disk_fallback_selftest(httrackp *opt) {
|
||||
int fail = 0;
|
||||
cache_back cache;
|
||||
htsblk r;
|
||||
char catbuff[HTS_URLMAXSIZE * 2];
|
||||
char *path;
|
||||
char *locbuf;
|
||||
FILE *fp;
|
||||
const char *const adr = "example.com";
|
||||
const char *const fil = "/blob.bin";
|
||||
char save[HTS_URLMAXSIZE * 2];
|
||||
/* no embedded NUL: were the read to leave this un-terminated, a later
|
||||
strlen() would run off the end (the bug this guards) */
|
||||
static const char body[] = "BINARY-on-disk-body-0123456789-no-trailing-nul";
|
||||
const size_t body_len = sizeof(body) - 1;
|
||||
|
||||
/* X-Save must start with path_html_utf8 so the reader resolves it verbatim
|
||||
(otherwise it re-roots it as a pre-3.40 relative path); then the body we
|
||||
create at fconv(save) is exactly where cache_readex looks for it. */
|
||||
fconcat(save, sizeof(save), StringBuff(opt->path_html_utf8),
|
||||
"example.com/blob.bin");
|
||||
|
||||
/* write only the header (X-In-Cache: 0); the body stays on disk */
|
||||
selftest_open_for_write(&cache, opt);
|
||||
{
|
||||
htsblk w;
|
||||
char locw[4];
|
||||
char *bodycopy = malloct(body_len);
|
||||
|
||||
hts_init_htsblk(&w);
|
||||
w.statuscode = 200;
|
||||
w.size = (LLint) body_len;
|
||||
strcpybuff(w.msg, "OK");
|
||||
strcpybuff(w.contenttype, "application/octet-stream");
|
||||
locw[0] = '\0';
|
||||
w.location = locw;
|
||||
w.is_write = 0;
|
||||
memcpy(bodycopy, body, body_len);
|
||||
w.adr = bodycopy;
|
||||
cache_add(opt, &cache, &w, adr, fil, save, 0 /* all_in_cache */, NULL);
|
||||
freet(bodycopy);
|
||||
}
|
||||
selftest_close(&cache);
|
||||
|
||||
/* create the on-disk body where the reader will look for it */
|
||||
path = fconv(catbuff, sizeof(catbuff), save);
|
||||
(void) structcheck(path);
|
||||
fp = FOPEN(path, "wb");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "cache-selftest: disk-fallback: cannot create '%s'\n",
|
||||
path);
|
||||
return 1;
|
||||
}
|
||||
if (fwrite(body, 1, body_len, fp) != body_len) {
|
||||
fprintf(stderr, "cache-selftest: disk-fallback: short write to '%s'\n",
|
||||
path);
|
||||
fail++;
|
||||
}
|
||||
fclose(fp);
|
||||
|
||||
/* read it back: takes the X-In-Cache: 0 disk-fallback branch */
|
||||
selftest_open_for_read(&cache, opt);
|
||||
locbuf = malloct(HTS_URLMAXSIZE * 2);
|
||||
locbuf[0] = '\0';
|
||||
r = cache_readex(opt, &cache, adr, fil, "", locbuf, NULL, 1);
|
||||
if (r.statuscode != 200) {
|
||||
fprintf(stderr,
|
||||
"cache-selftest: disk-fallback: statuscode %d, expected 200"
|
||||
" (path not taken or read failed)\n",
|
||||
r.statuscode);
|
||||
fail++;
|
||||
}
|
||||
if (r.size != (LLint) body_len) {
|
||||
fprintf(stderr,
|
||||
"cache-selftest: disk-fallback: size " LLintP ", expected %d\n",
|
||||
(LLint) r.size, (int) body_len);
|
||||
fail++;
|
||||
} else if (r.adr == NULL || memcmp(r.adr, body, body_len) != 0) {
|
||||
fprintf(stderr, "cache-selftest: disk-fallback: body mismatch\n");
|
||||
fail++;
|
||||
}
|
||||
/* the loaded body must be NUL-terminated at [size] */
|
||||
if (r.adr != NULL && r.adr[r.size] != '\0') {
|
||||
fprintf(stderr, "cache-selftest: disk-fallback: body not NUL-terminated\n");
|
||||
fail++;
|
||||
}
|
||||
if (r.adr != NULL) {
|
||||
freet(r.adr);
|
||||
}
|
||||
freet(locbuf);
|
||||
selftest_close(&cache);
|
||||
return fail;
|
||||
}
|
||||
|
||||
int cache_selftests(httrackp *opt, const char *dir) {
|
||||
int failures = 0;
|
||||
cache_back cache;
|
||||
@@ -257,6 +368,10 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
strcatbuff(base, "/");
|
||||
}
|
||||
StringCopy(opt->path_log, base);
|
||||
/* the disk-fallback pass resolves on-disk body paths through fconv(), which
|
||||
is rooted at path_html; keep it inside the test directory too */
|
||||
StringCopy(opt->path_html, base);
|
||||
StringCopy(opt->path_html_utf8, base);
|
||||
}
|
||||
opt->cache = 1;
|
||||
|
||||
@@ -366,6 +481,9 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
"", body_updated, strlen(body_updated));
|
||||
selftest_close(&cache);
|
||||
|
||||
/* pass 5: the disk-fallback read path (X-In-Cache: 0, body on disk) */
|
||||
failures += disk_fallback_selftest(opt);
|
||||
|
||||
for (i = 0; i < large_count; i++) {
|
||||
freet(large_body[i]);
|
||||
}
|
||||
|
||||
@@ -633,13 +633,12 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
// c'est plus propre et plus logique que d'entrer à la main les liens dans la pile
|
||||
// on bénéficie ainsi des vérifications et des tests du robot pour les liens "primaires"
|
||||
primary = (char *) malloct(primary_len);
|
||||
if (primary) {
|
||||
primary[0] = '\0';
|
||||
} else {
|
||||
if (!primary) {
|
||||
printf("PANIC! : Not enough memory [%d]\n", __LINE__);
|
||||
XH_extuninit;
|
||||
return 0;
|
||||
}
|
||||
htsbuff primarybuff = htsbuff_ptr(primary, primary_len);
|
||||
|
||||
while(*a) {
|
||||
int i;
|
||||
@@ -687,11 +686,11 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
strcatbuff(tempo, "*"); // ajouter un *
|
||||
}
|
||||
}
|
||||
if (type)
|
||||
strcpybuff(filters[filptr], "+");
|
||||
else
|
||||
strcpybuff(filters[filptr], "-");
|
||||
strcatbuff(filters[filptr], tempo);
|
||||
{
|
||||
htsbuff fb = htsbuff_ptr(filters[filptr], HTS_URLMAXSIZE * 2);
|
||||
htsbuff_cpy(&fb, type ? "+" : "-");
|
||||
htsbuff_cat(&fb, tempo);
|
||||
}
|
||||
filptr++;
|
||||
|
||||
/* sanity check */
|
||||
@@ -726,12 +725,10 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
}
|
||||
url[i++] = '\0';
|
||||
|
||||
//strcatbuff(primary,"<PRIMARY=\"");
|
||||
if (strstr(url, ":/") == NULL)
|
||||
strcatbuff(primary, "http://");
|
||||
strcatbuff(primary, url);
|
||||
//strcatbuff(primary,"\">");
|
||||
strcatbuff(primary, "\n");
|
||||
htsbuff_cat(&primarybuff, "http://");
|
||||
htsbuff_cat(&primarybuff, url);
|
||||
htsbuff_cat(&primarybuff, "\n");
|
||||
}
|
||||
} // while
|
||||
|
||||
@@ -762,7 +759,6 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
int filelist_ptr = 0;
|
||||
int n = 0;
|
||||
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
||||
char *primary_ptr = primary + strlen(primary);
|
||||
|
||||
while(filelist_ptr < filelist_sz) {
|
||||
int count =
|
||||
@@ -771,13 +767,10 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
if (count && line[0]) {
|
||||
n++;
|
||||
if (strstr(line, ":/") == NULL) {
|
||||
strcpybuff(primary_ptr, "http://");
|
||||
primary_ptr += strlen(primary_ptr);
|
||||
htsbuff_cat(&primarybuff, "http://");
|
||||
}
|
||||
strcpybuff(primary_ptr, line);
|
||||
primary_ptr += strlen(primary_ptr);
|
||||
strcpybuff(primary_ptr, "\n");
|
||||
primary_ptr += 1;
|
||||
htsbuff_cat(&primarybuff, line);
|
||||
htsbuff_cat(&primarybuff, "\n");
|
||||
}
|
||||
}
|
||||
// fclose(fp);
|
||||
@@ -2193,16 +2186,19 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_log),
|
||||
"hts-cache/new.lst"), "rb");
|
||||
if (new_lst != NULL && sz != (size_t) -1) {
|
||||
char *adr = (char *) malloct(sz);
|
||||
/* +1 for the NUL below: new.lst is read raw, and the strstr()
|
||||
that follows needs a terminated C string. */
|
||||
char *adr = (char *) malloct(sz + 1);
|
||||
|
||||
if (adr) {
|
||||
if (fread(adr, 1, sz, new_lst) == sz) {
|
||||
adr[sz] = '\0';
|
||||
char line[1100];
|
||||
int purge = 0;
|
||||
|
||||
while(!feof(old_lst)) {
|
||||
linput(old_lst, line, 1000);
|
||||
if (!strstr(adr, line)) { // fichier non trouvé dans le nouveau?
|
||||
if (!strstr(adr, line)) { // not found in the new list?
|
||||
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strcpybuff(file, StringBuff(opt->path_html));
|
||||
@@ -2450,9 +2446,10 @@ void host_ban(httrackp * opt, int ptr,
|
||||
// interdire host
|
||||
assertf((*_FILTERS_PTR) < opt->maxfilter);
|
||||
if (*_FILTERS_PTR < opt->maxfilter) {
|
||||
strcpybuff(_FILTERS[*_FILTERS_PTR], "-");
|
||||
strcatbuff(_FILTERS[*_FILTERS_PTR], host);
|
||||
strcatbuff(_FILTERS[*_FILTERS_PTR], "/*"); // host/ * interdit
|
||||
htsbuff fb = htsbuff_ptr(_FILTERS[*_FILTERS_PTR], HTS_URLMAXSIZE * 2);
|
||||
htsbuff_cpy(&fb, "-");
|
||||
htsbuff_cat(&fb, host);
|
||||
htsbuff_cat(&fb, "/*"); // forbid host/*
|
||||
(*_FILTERS_PTR)++;
|
||||
}
|
||||
// oups
|
||||
@@ -3515,7 +3512,7 @@ char *next_token(char *p, int flag) {
|
||||
p--;
|
||||
do {
|
||||
p++;
|
||||
if (flag && (*p == '\\')) { // sauter \x ou \"
|
||||
if (flag && (*p == '\\')) { // skip \x or \"
|
||||
if (quote) {
|
||||
char c = '\0';
|
||||
|
||||
@@ -3524,20 +3521,14 @@ char *next_token(char *p, int flag) {
|
||||
else if (*(p + 1) == '"')
|
||||
c = '"';
|
||||
if (c) {
|
||||
char BIGSTK tempo[8192];
|
||||
|
||||
tempo[0] = c;
|
||||
tempo[1] = '\0';
|
||||
strcatbuff(tempo, p + 2);
|
||||
strcpybuff(p, tempo);
|
||||
/* unescape the 2 chars to one, shifting left in place */
|
||||
*p = c;
|
||||
memmove(p + 1, p + 2, strlen(p + 2) + 1);
|
||||
}
|
||||
}
|
||||
} else if (*p == 34) { // guillemets (de fin)
|
||||
char BIGSTK tempo[8192];
|
||||
|
||||
tempo[0] = '\0';
|
||||
strcatbuff(tempo, p + 1);
|
||||
strcpybuff(p, tempo); /* wipe "" */
|
||||
} else if (*p == 34) { // closing quote
|
||||
/* drop the quote, shifting the rest left in place */
|
||||
memmove(p, p + 1, strlen(p + 1) + 1);
|
||||
p--;
|
||||
/* */
|
||||
quote = !quote;
|
||||
@@ -3877,7 +3868,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
|
||||
afs.af.adr, afs.save, savename(), tempo);
|
||||
if (str->localLink
|
||||
&& str->localLinkSize > (int) strlen(tempo) + 1) {
|
||||
strcpybuff(str->localLink, tempo);
|
||||
strlcpybuff(str->localLink, tempo, str->localLinkSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3889,11 +3880,11 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
|
||||
lien);
|
||||
if (str->localLink
|
||||
&& str->localLinkSize > (int) (strlen(afs.af.adr) + strlen(afs.af.fil) + 8)) {
|
||||
str->localLink[0] = '\0';
|
||||
htsbuff lb = htsbuff_ptr(str->localLink, str->localLinkSize);
|
||||
if (!link_has_authority(afs.af.adr))
|
||||
strcpybuff(str->localLink, "http://");
|
||||
strcatbuff(str->localLink, afs.af.adr);
|
||||
strcatbuff(str->localLink, afs.af.fil);
|
||||
htsbuff_cat(&lb, "http://");
|
||||
htsbuff_cat(&lb, afs.af.adr);
|
||||
htsbuff_cat(&lb, afs.af.fil);
|
||||
}
|
||||
r = -1;
|
||||
}
|
||||
|
||||
@@ -236,6 +236,55 @@ static void basic_selftests(void) {
|
||||
}
|
||||
freet(slots);
|
||||
}
|
||||
// next_token(): in-place token scanner. Strips surrounding quotes, unescapes
|
||||
// \" and \\ when flag is set, and returns the token terminator (the space, or
|
||||
// NULL at end of string). The unquote/unescape rewrites the string in place
|
||||
// by shifting left, so the result is always shorter -- regression for that
|
||||
// compaction.
|
||||
{
|
||||
char tok[64];
|
||||
|
||||
// plain token: unchanged, returns a pointer AT the separating space (exact
|
||||
// position, not just any space -- a strchr-style impl would land elsewhere
|
||||
// once quotes shift the content)
|
||||
strcpybuff(tok, "abc def");
|
||||
{
|
||||
char *const end = next_token(tok, 0);
|
||||
assertf(end == tok + 3 && *end == ' ' && strcmp(tok, "abc def") == 0);
|
||||
}
|
||||
// surrounding quotes stripped, returns the (post-shift) trailing space
|
||||
strcpybuff(tok, "\"ab\" cd");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == tok + 2 && *end == ' ' && strcmp(tok, "ab cd") == 0);
|
||||
}
|
||||
// a space inside quotes does not end the token; end of string returns NULL
|
||||
strcpybuff(tok, "\"a b\"c");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a bc") == 0);
|
||||
}
|
||||
// \" and \\ are unescaped to literal " and \ in place
|
||||
strcpybuff(tok, "\"a\\\"b\\\\c\"");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a\"b\\c") == 0);
|
||||
}
|
||||
// unterminated quote: the opening quote is dropped, the rest survives, and
|
||||
// the scan runs to the NUL (returns NULL)
|
||||
strcpybuff(tok, "\"ab");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "ab") == 0);
|
||||
}
|
||||
// trailing lone backslash in a quote: *(p+1) is the NUL, not an escape, so
|
||||
// the backslash is kept intact (and there is no over-read past the NUL)
|
||||
strcpybuff(tok, "\"a\\");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a\\") == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Self-tests for the htssafe.h bounded string ops (driven by httrack -#8).
|
||||
|
||||
@@ -145,8 +145,13 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
if (!hex) {
|
||||
if (src[i] >= '0' && src[i] <= '9') {
|
||||
const int h = src[i] - '0';
|
||||
uc *= 10;
|
||||
uc += h;
|
||||
/* Guard before multiplying: a codepoint past the Unicode max
|
||||
(0x10FFFF) is invalid anyway, so stop rather than overflow uc. */
|
||||
if (uc > (0x10FFFF - h) / 10) {
|
||||
ampStart = (size_t) -1;
|
||||
} else {
|
||||
uc = uc * 10 + h;
|
||||
}
|
||||
} else {
|
||||
/* abandon */
|
||||
ampStart = (size_t) -1;
|
||||
@@ -156,8 +161,11 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
else {
|
||||
const int h = get_hex_value(src[i]);
|
||||
if (h != -1) {
|
||||
uc *= 16;
|
||||
uc += h;
|
||||
if (uc > (0x10FFFF - h) / 16) {
|
||||
ampStart = (size_t) -1;
|
||||
} else {
|
||||
uc = uc * 16 + h;
|
||||
}
|
||||
} else {
|
||||
/* abandon */
|
||||
ampStart = (size_t) -1;
|
||||
|
||||
@@ -334,7 +334,7 @@ void index_finish(const char *indexpath, int mode) {
|
||||
if (fp_tmpproject) {
|
||||
tab = (char **) malloct(sizeof(char *) * (hts_primindex_size + 2));
|
||||
if (tab) {
|
||||
blk = malloct(size + 4);
|
||||
blk = malloct(size + 1);
|
||||
if (blk) {
|
||||
fseek(fp_tmpproject, 0, SEEK_SET);
|
||||
if ((INTsys) fread(blk, 1, size, fp_tmpproject) == size) {
|
||||
@@ -343,6 +343,7 @@ void index_finish(const char *indexpath, int mode) {
|
||||
int i;
|
||||
FILE *fp;
|
||||
|
||||
blk[size] = '\0';
|
||||
while((b = strchr(a, '\n')) && (index < hts_primindex_size)) {
|
||||
tab[index++] = a;
|
||||
*b = '\0';
|
||||
|
||||
@@ -3416,8 +3416,17 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (RUN_CALLBACK4(opt, postprocess, &cAddr, &cSize, urladr(), urlfil()) == 1) {
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"engine: postprocess-html: callback modified data, applying %d bytes", cSize);
|
||||
TypedArraySize(output_buffer) = 0;
|
||||
TypedArrayAppend(output_buffer, cAddr, cSize);
|
||||
/* The callback either edits output_buffer in place (cAddr
|
||||
unchanged) or hands back its own buffer (cAddr changed). Only
|
||||
the latter needs a copy: re-appending output_buffer onto itself
|
||||
would read freed memory, as the append's realloc can relocate
|
||||
the block out from under cAddr. */
|
||||
if (cAddr != TypedArrayElts(output_buffer)) {
|
||||
TypedArraySize(output_buffer) = 0;
|
||||
TypedArrayAppend(output_buffer, cAddr, cSize);
|
||||
} else {
|
||||
TypedArraySize(output_buffer) = (size_t) cSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1162,7 +1162,7 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char *url,
|
||||
FILE *fp = fopen(file_convert(catbuff, sizeof(catbuff), previous_save), "rb");
|
||||
|
||||
if (fp != NULL) {
|
||||
r->adr = (char *) malloc(r->size + 4);
|
||||
r->adr = (char *) malloc(r->size + 1);
|
||||
if (r->adr != NULL) {
|
||||
if (r->size > 0
|
||||
&& fread(r->adr, 1, r->size, fp) != r->size) {
|
||||
@@ -1172,6 +1172,7 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char *url,
|
||||
sprintf(r->msg, "Read error in cache disk data: %s",
|
||||
strerror(last_errno));
|
||||
}
|
||||
r->adr[r->size] = '\0';
|
||||
} else {
|
||||
r->statuscode = STATUSCODE_INVALID;
|
||||
strcpy(r->msg,
|
||||
|
||||
15
tests/13_local-cookies.test
Executable file
15
tests/13_local-cookies.test
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Cookie chain against the local test server (replaces the old online
|
||||
# ut/cookies/*.php fixtures). entrance.php sets cat/cake; second.php checks
|
||||
# them and sets badger; third.php checks all three. A missing or wrong cookie
|
||||
# returns 500, which would surface as an httrack error and a missing file, so a
|
||||
# clean 3-files/0-errors run proves the cookie jar is replayed across links.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
|
||||
--found 'cookies/entrance.html' \
|
||||
--found 'cookies/second.html' \
|
||||
--found 'cookies/third.html' \
|
||||
httrack 'BASEURL/cookies/entrance.php'
|
||||
18
tests/14_local-https.test
Executable file
18
tests/14_local-https.test
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# HTTPS crawl against the local test server, using the shipped self-signed
|
||||
# cert. httrack does not verify certs (htslib.c: SSL_CTX_new with no
|
||||
# SSL_CTX_set_verify), so the self-signed cert is accepted as-is and this
|
||||
# exercises the real TLS path offline. basic.html links to link.html with four
|
||||
# distinct query strings, each saved under a hashed name -> 5 files.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
if test "$HTTPS_SUPPORT" == "no"; then
|
||||
echo "no https support compiled, skipping"
|
||||
exit 77
|
||||
fi
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --tls --errors 0 --files 5 \
|
||||
--found 'simple/basic.html' \
|
||||
httrack 'BASEURL/simple/basic.html'
|
||||
@@ -1,4 +1,7 @@
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh
|
||||
# Note: EXTRA_DIST globs are NOT expanded by automake; list fixtures explicitly.
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
|
||||
@@ -35,6 +38,8 @@ TESTS = \
|
||||
11_crawl-international.test \
|
||||
11_crawl-longurl.test \
|
||||
11_crawl-parsing.test \
|
||||
12_crawl_https.test
|
||||
12_crawl_https.test \
|
||||
13_local-cookies.test \
|
||||
14_local-https.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -476,7 +476,12 @@ target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh
|
||||
|
||||
# Note: EXTRA_DIST globs are NOT expanded by automake; list fixtures explicitly.
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html
|
||||
|
||||
# note: libtool should handle that
|
||||
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
||||
TESTS_ENVIRONMENT = PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH \
|
||||
@@ -509,7 +514,9 @@ TESTS = \
|
||||
11_crawl-international.test \
|
||||
11_crawl-longurl.test \
|
||||
11_crawl-parsing.test \
|
||||
12_crawl_https.test
|
||||
12_crawl_https.test \
|
||||
13_local-cookies.test \
|
||||
14_local-https.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
all: all-am
|
||||
|
||||
235
tests/local-crawl.sh
Executable file
235
tests/local-crawl.sh
Executable file
@@ -0,0 +1,235 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Launcher for httrack crawl tests against the local Python test server.
|
||||
#
|
||||
# Starts tests/local-server.py on an ephemeral port, discovers the port from
|
||||
# the server's stdout, then runs httrack against http(s)://127.0.0.1:$PORT and
|
||||
# audits the mirror. The server is always killed and the tmpdir removed on exit.
|
||||
#
|
||||
# The token BASEURL in any httrack argument is replaced with the discovered
|
||||
# http(s)://127.0.0.1:$PORT base. --found/--directory paths are relative to the
|
||||
# discovered host root (127.0.0.1_<port>/), since the random port leaks into
|
||||
# the mirror directory name.
|
||||
#
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
|
||||
# Treat any use of an unset variable as an error.
set -u

# Absolute directory of this script; the server and its fixtures sit next to it.
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"
cert="${testdir}/server.crt"
key="${testdir}/server.key"
# LOCAL_SERVER_ROOT, when set and non-empty, replaces the default docroot.
root="${LOCAL_SERVER_ROOT:-${testdir}/server-root}"

# Mutable state, all initially empty ("off" / "nothing to clean up yet").
tls=
verbose=
tmpdir=
serverpid=
crawlpid=
|
||||
|
||||
# Print all arguments on stderr as a single line prefixed with "** ".
# Always returns 0 so it can be used inside || chains.
warning() {
  printf '%s\n' "** $*" >&2
  return 0
}
|
||||
# Report a fatal problem through warning() and terminate with exit status 1.
die() {
  warning "$@"
  exit 1
}
|
||||
# Echo the arguments to stderr, but only when $verbose is non-empty
# (set by --debug). Always returns 0.
debug() {
  if [ -n "$verbose" ]; then
    echo "$*" >&2
  fi
  return 0
}
|
||||
# Open a progress line on stderr: "[<label>] ..<TAB>" with no trailing newline.
info() {
  local label="$*"
  printf '[%s] ..\t' "$label" >&2
}
|
||||
# Print the arguments on stderr followed by a newline.
result() {
  echo "$*" 1>&2
}
|
||||
|
||||
# Trap handler: stop whatever is still running and drop the scratch directory.
# Reads/clears the globals crawlpid and serverpid, reads tmpdir and nopurge.
# Each pid variable is emptied once handled, so a second call skips that step.
cleanup() {
  # A crawl that is still registered gets SIGKILL.
  if [ -n "$crawlpid" ]; then
    kill -9 "$crawlpid" 2>/dev/null
    crawlpid=
  fi

  if [ -n "$serverpid" ]; then
    kill "$serverpid" 2>/dev/null
    # Reap it so the port is released before we rm the tmpdir/log.
    wait "$serverpid" 2>/dev/null
    serverpid=
  fi

  # A non-empty nopurge (--no-purge) keeps the scratch directory in place.
  if [ -n "$tmpdir" ] && [ -d "$tmpdir" ] && [ -z "$nopurge" ]; then
    rm -rf "$tmpdir"
  fi
}
|
||||
|
||||
# assert_equals LABEL EXPECTED ACTUAL
# Announce LABEL via info(), then compare the two strings. On mismatch the
# difference is reported and the script exits with status 1; otherwise
# "OK (<expected>)" is printed.
assert_equals() {
  local label="$1" expected="$2" actual="$3"

  info "$label"
  if [ "$expected" != "$actual" ]; then
    result "expected '$expected', got '$actual'"
    exit 1
  fi
  result "OK ($expected)"
}
|
||||
|
||||
nopurge=

# cleanup runs from the EXIT trap only. Each fatal signal is turned into an
# explicit exit (status 128 + signal number), which in turn fires the EXIT
# trap. Sharing one handler between EXIT and the signals would let the script
# resume after a signal with its server already killed and its tmpdir removed.
trap cleanup EXIT
for sig in HUP INT QUIT PIPE TERM; do
  # The exit status is expanded now; the installed handler is just "exit <n>".
  trap "exit $((128 + $(kill -l "$sig")))" "$sig"
done
unset sig

# python3 is required; mirror check-network.sh's skip-with-77 convention.
if ! command -v python3 >/dev/null; then
  echo "python3 not found; skipping local crawl tests"
  exit 77
fi

# Scratch area for the server log, the httrack log and the mirror itself.
tmptopdir=${TMPDIR:-/tmp}
test -d "$tmptopdir" || mkdir -p "$tmptopdir" || die "no temporary directory; set TMPDIR"
tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create tmpdir"
|
||||
|
||||
# --- parse leading control flags --------------------------------------------
# Everything up to the literal word "httrack" configures this launcher:
#   --debug / --no-purge / --tls           switches
#   --root DIR                             docroot handed to the server
#   --errors N / --files N                 audit checks, stored in audit[]
#   --found P / --not-found P / --directory P
# On exit from the loop, pos indexes the first argument after "httrack".
declare -a audit=()
scheme=http
pos=0
args=("$@")
nargs=$#
seen_httrack=
while test "$pos" -lt "$nargs"; do
  opt="${args[$pos]}"
  case "$opt" in
  --debug) verbose=1 ;;
  --no-purge)
    nopurge=1
    audit+=("--no-purge")
    ;;
  --tls)
    tls=1
    scheme=https
    ;;
  --root | --errors | --files | --found | --not-found | --directory)
    # These all consume the next word. Check that it exists: indexing past
    # the end of args[] under `set -u` aborts with an "unbound variable"
    # error that does not say which option was incomplete.
    pos=$((pos + 1))
    test "$pos" -lt "$nargs" || die "option $opt requires an argument"
    if test "$opt" == "--root"; then
      root="${args[$pos]}"
    else
      audit+=("$opt" "${args[$pos]}")
    fi
    ;;
  httrack)
    seen_httrack=1
    pos=$((pos + 1))
    break
    ;;
  *) die "unrecognized option $opt" ;;
  esac
  pos=$((pos + 1))
done
# Without the separator there is nothing to crawl; stop here instead of
# launching httrack with an empty argument list.
test -n "$seen_httrack" || die "missing 'httrack' separator before the crawl arguments"
|
||||
|
||||
# --- start the server --------------------------------------------------------
test -r "$server" || die "cannot read $server"
serverlog="${tmpdir}/server.log"
serverargs=(--root "$root")
if test -n "$tls"; then
  # Fail with a clear message here instead of a server-side traceback.
  test -r "$cert" || die "cannot read $cert"
  test -r "$key" || die "cannot read $key"
  serverargs+=(--tls --cert "$cert" --key "$key")
fi
debug "starting python3 $server ${serverargs[*]}"
python3 "$server" "${serverargs[@]}" >"$serverlog" 2>&1 &
serverpid=$!

# Wait for the "PORT <n>" line (server prints it once bound): up to 50 polls,
# 0.1 s apart. stdout and stderr share the log, so the line is searched for
# anywhere in the file rather than assumed to be the first one, and only an
# all-digit port is accepted.
port=
for _ in $(seq 1 50); do
  port=$(sed -n 's/^PORT \([0-9][0-9]*\)$/\1/p' "$serverlog" 2>/dev/null | head -n1)
  test -n "$port" && break
  kill -0 "$serverpid" 2>/dev/null || die "server exited early: $(cat "$serverlog")"
  sleep 0.1
done
test -n "$port" || die "could not discover server port: $(cat "$serverlog")"
debug "server listening on ${scheme}://127.0.0.1:${port}"

# Value substituted for the BASEURL token in the httrack arguments.
baseurl="${scheme}://127.0.0.1:${port}"
|
||||
|
||||
# --- substitute BASEURL in the remaining (httrack) args ----------------------
# Copy args[pos..] into hts[], replacing every occurrence of the BASEURL token
# with the discovered base URL. pos ends up equal to nargs.
declare -a hts=()
for (( ; pos < nargs; pos++)); do
  hts+=("${args[pos]//BASEURL/$baseurl}")
done
|
||||
|
||||
# --- run httrack -------------------------------------------------------------
# Use the shell builtin `command -v` (as the python3 probe does) so the lookup
# does not depend on an external `which` being installed.
command -v httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
test -n "$ver" || die "could not run httrack"

out="${tmpdir}/crawl"
mkdir "$out" || die "could not create $out"
# Localhost is fast; disable the rate/bandwidth safety limits but keep a
# max-time backstop so a hang cannot wedge the suite.
declare -a moreargs=(--quiet --max-time=120 --timeout=30 --disable-security-limits --robots=0)
log="${tmpdir}/log"
info "running httrack ${hts[*]}"
# Run in the background and record the pid so cleanup() can kill a crawl that
# is still running when the script is interrupted.
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" "${moreargs[@]}" "${hts[@]}" >"$log" 2>&1 &
crawlpid=$!
wait "$crawlpid"
crawlres=$?
crawlpid=
# httrack exits 0 even on hard connect/DNS errors, so this is a backstop only;
# the real guard is the audit below (--errors 0 plus the host-root existence check).
if test "$crawlres" -ne 0; then
  result "httrack exited $crawlres"
  cat "$log" >&2
  exit 1
fi
result "OK"
# Surface any "Error:" lines from the crawl log for whoever reads the output.
grep -iE "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt" >&2
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
# The port-suffixed directory is preferred; the bare address is the fallback.
hostroot=
if test -d "${out}/127.0.0.1_${port}"; then
  hostroot="${out}/127.0.0.1_${port}"
elif test -d "${out}/127.0.0.1"; then
  hostroot="${out}/127.0.0.1"
fi
test -n "$hostroot" || die "could not find host root under $out"
debug "host root: $hostroot"
|
||||
|
||||
# --- audit -------------------------------------------------------------------
# Replay the checks queued in audit[] during option parsing. Entries are
# "--check value" pairs; anything else in the array (e.g. --no-purge) is
# stepped over. The first failing check exits the script with status 1.
i=0
while test "$i" -lt "${#audit[@]}"; do
  case "${audit[$i]}" in
  --errors)
    # Number of "Error:" lines in httrack's log must equal the given count.
    i=$((i + 1))
    assert_equals "checking errors" "${audit[$i]}" \
      "$(grep -iEc "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt")"
    ;;
  --files)
    # File count is taken from the "mirror complete ... files written" line.
    i=$((i + 1))
    nFiles=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${out}/hts-log.txt" |
      sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
    assert_equals "checking files" "${audit[$i]}" "$nFiles"
    ;;
  --found)
    i=$((i + 1))
    info "checking for ${audit[$i]}"
    if ! test -f "${hostroot}/${audit[$i]}"; then
      result "not found"
      exit 1
    fi
    result "OK"
    ;;
  --not-found)
    i=$((i + 1))
    info "checking absence of ${audit[$i]}"
    if test -f "${hostroot}/${audit[$i]}"; then
      result "present"
      exit 1
    fi
    result "OK"
    ;;
  --directory)
    i=$((i + 1))
    info "checking for dir ${audit[$i]}"
    if ! test -d "${hostroot}/${audit[$i]}"; then
      result "not found"
      exit 1
    fi
    result "OK"
    ;;
  esac
  i=$((i + 1))
done
|
||||
182
tests/local-server.py
Executable file
182
tests/local-server.py
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Self-contained local web server for httrack's crawl tests.
|
||||
|
||||
Serves static fixtures from a docroot plus a handful of dynamic endpoints
|
||||
(cookies, ...) so httrack can be exercised over loopback, deterministically and
|
||||
offline, instead of crawling the live ut.httrack.com.
|
||||
|
||||
Binds to an ephemeral port (port 0) and prints the chosen port to stdout as
|
||||
"PORT <n>\n" so a launcher can discover it. Pass --tls to wrap the socket with
|
||||
the shipped self-signed test cert; httrack does not verify certs, so no CA
|
||||
trust plumbing is needed.
|
||||
|
||||
stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
from urllib.parse import quote, unquote, urlsplit
|
||||
|
||||
# Cookie chain replicated from the old ut/cookies/*.php fixtures.
|
||||
COOKIE_PATH = "/cookies/"
|
||||
COOKIES = {
|
||||
"cat": "dog",
|
||||
"cake": "is a lie!",
|
||||
"badger": "mushroom, with 'ants'",
|
||||
}
|
||||
|
||||
PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
\t"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
||||
<head>
|
||||
\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
\t<title>Sample test</title>
|
||||
</head>
|
||||
<body>
|
||||
{body}
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class Handler(SimpleHTTPRequestHandler):
    """Static file handler with a few dynamic routes layered on top.

    Paths listed in ROUTES are answered by the matching ``route_*`` method;
    every other GET/HEAD request falls through to SimpleHTTPRequestHandler.
    """

    def log_message(self, fmt, *args):
        """Log requests only when LOCAL_SERVER_VERBOSE is set (non-empty)."""
        # Quiet by default; the launcher captures httrack's own log anyway.
        if not os.environ.get("LOCAL_SERVER_VERBOSE"):
            return
        super().log_message(fmt, *args)

    # --- helpers -----------------------------------------------------------

    def request_cookies(self):
        """Return the request's Cookie header as a {name: decoded-value} dict.

        Mirrors PHP's $_COOKIE: values are url-decoded, matching the encoding
        applied when the cookie was set (see set_cookie). Fragments without
        an "=" are ignored.
        """
        header = self.headers.get("Cookie", "")
        fields = (chunk.strip() for chunk in header.split(";"))
        pairs = (field.partition("=") for field in fields if "=" in field)
        return {name.strip(): unquote(value.strip()) for name, _, value in pairs}

    def set_cookie(self, name, value):
        """Queue a Set-Cookie header for the response being built.

        The value is url-encoded like PHP's setcookie() so spaces/quotes/commas
        stay a single token that httrack can store and replay verbatim.
        """
        encoded = quote(value)
        self._set_cookies.append(f"{name}={encoded}; Path={COOKIE_PATH}")

    def send_html(self, body, status=200, extra_status=None):
        """Send ``body`` wrapped in the PAGE template, plus any queued cookies.

        ``extra_status`` is passed to send_response() as the status message.
        For HEAD requests only the headers are sent.
        """
        payload = PAGE.format(body=body).encode("utf-8")
        self.send_response(status, extra_status)
        headers = [
            ("Content-Type", "text/html; charset=utf-8"),
            ("Content-Length", str(len(payload))),
        ]
        headers.extend(("Set-Cookie", cookie) for cookie in self._set_cookies)
        for key, val in headers:
            self.send_header(key, val)
        self.end_headers()
        if self.command == "HEAD":
            return
        self.wfile.write(payload)

    def fail_cookie(self, what):
        """Answer 500, naming the offending cookie in the status line."""
        # The old PHPs answered 500 with the reason in the status line.
        reason = f"The {what} is missing or invalid"
        self.send_html("", status=500, extra_status=reason)

    def _cookies_ok(self, *names):
        """Check the named request cookies against COOKIES, in order.

        On the first mismatch a 500 is sent via fail_cookie() and False is
        returned; True means every listed cookie carried the expected value.
        """
        jar = self.request_cookies()
        for name in names:
            if jar.get(name) != COOKIES[name]:
                self.fail_cookie(name)
                return False
        return True

    # --- dynamic routes ----------------------------------------------------

    def route_entrance(self):
        """First hop: set "cat" and "cake", link to second.php."""
        for name in ("cat", "cake"):
            self.set_cookie(name, COOKIES[name])
        self.send_html('\tThis is a <a href="second.php">link</a>')

    def route_second(self):
        """Second hop: require "cat" and "cake", set "badger", link on."""
        if not self._cookies_ok("cat", "cake"):
            return
        self.set_cookie("badger", COOKIES["badger"])
        self.send_html('\tThis is a <a href="third.php">link</a>')

    def route_third(self):
        """Last hop: require all three cookies before serving the page."""
        if not self._cookies_ok("cat", "cake", "badger"):
            return
        self.send_html("\tThis is a test.")

    def route_robots(self):
        """Serve a robots.txt whose Disallow rule is empty."""
        payload = b"User-agent: *\nDisallow:\n"
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        if self.command == "HEAD":
            return
        self.wfile.write(payload)

    # Request path (query string excluded) -> route method.
    ROUTES = {
        "/cookies/entrance.php": route_entrance,
        "/cookies/second.php": route_second,
        "/cookies/third.php": route_third,
        "/robots.txt": route_robots,
    }

    # --- dispatch ----------------------------------------------------------

    def dispatch(self):
        """Run the dynamic route for this request, if there is one.

        Resets the per-request cookie queue, then looks up the path component
        of the request target in ROUTES. Returns True when a route handled
        the request, False when the caller should serve a static file.
        """
        self._set_cookies = []
        route = self.ROUTES.get(urlsplit(self.path).path)
        if route is None:
            return False
        route(self)
        return True

    def do_GET(self):
        """GET: dynamic route first, static file otherwise."""
        handled = self.dispatch()
        if not handled:
            super().do_GET()

    def do_HEAD(self):
        """HEAD: dynamic route first, static file otherwise."""
        handled = self.dispatch()
        if not handled:
            super().do_HEAD()
|
||||
|
||||
|
||||
def main():
    """Parse the command line, bind to an ephemeral port and serve until killed.

    Options: --root (required docroot), --bind (default 127.0.0.1), --tls,
    --cert and --key. With --tls both --cert and --key are mandatory; a
    missing one is reported as a usage error (argparse exit status 2) instead
    of surfacing later as a traceback from load_cert_chain().

    Once the socket is bound, "PORT <n>" is printed to stdout and flushed so
    the launcher can discover the port.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--root", required=True, help="docroot for static files")
    parser.add_argument("--bind", default="127.0.0.1", help="bind address")
    parser.add_argument("--tls", action="store_true", help="serve HTTPS")
    parser.add_argument("--cert", help="TLS certificate (PEM)")
    parser.add_argument("--key", help="TLS private key (PEM)")
    args = parser.parse_args()

    if args.tls and not (args.cert and args.key):
        parser.error("--tls requires both --cert and --key")

    root = os.path.abspath(args.root)

    def factory(*a, **kw):
        # Pin the docroot for every handler instance the server creates.
        return Handler(*a, directory=root, **kw)

    # Port 0 asks the OS for any free port.
    httpd = ThreadingHTTPServer((args.bind, 0), factory)
    try:
        if args.tls:
            import ssl

            ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
            ctx.load_cert_chain(certfile=args.cert, keyfile=args.key)
            httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)

        port = httpd.socket.getsockname()[1]
        # The launcher reads this line to discover the ephemeral port.
        print(f"PORT {port}", flush=True)

        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Close the listening socket on every exit path.
        httpd.server_close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
18
tests/server-root/simple/basic.html
Normal file
18
tests/server-root/simple/basic.html
Normal file
@@ -0,0 +1,18 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr">
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<title>Sample test</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
This is a <a href="link.html?v=1">link</a>
|
||||
This is a <a href='link.html?v=2'>link</a>
|
||||
This is a <a href="./link.html?v=3">link</a>
|
||||
This is a <a href=link.html?v=4>link</a>
|
||||
|
||||
</body>
|
||||
3
tests/server-root/simple/link.html
Normal file
3
tests/server-root/simple/link.html
Normal file
@@ -0,0 +1,3 @@
|
||||
This is a link.
|
||||
|
||||
Go back to <a href="basic.html">home</a>.
|
||||
21
tests/server.crt
Normal file
21
tests/server.crt
Normal file
@@ -0,0 +1,21 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIDbzCCAlegAwIBAgIUdWkDDomnY3WW95UqJ+UOASuR/i0wDQYJKoZIhvcNAQEL
|
||||
BQAwODESMBAGA1UEAwwJMTI3LjAuMC4xMSIwIAYDVQQKDBlIVFRyYWNrIGxvY2Fs
|
||||
IHRlc3Qgc2VydmVyMCAXDTI2MDYxNTE0NDQxMFoYDzIwNTYwNjA3MTQ0NDEwWjA4
|
||||
MRIwEAYDVQQDDAkxMjcuMC4wLjExIjAgBgNVBAoMGUhUVHJhY2sgbG9jYWwgdGVz
|
||||
dCBzZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDx78mogNhT
|
||||
noWwRa51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf
|
||||
3rwj79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJC
|
||||
SGxIin9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4
|
||||
ZbPgwep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaE
|
||||
nQrAlTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJx
|
||||
rjVEPfA8QSbtAgMBAAGjbzBtMB0GA1UdDgQWBBTHE0KKW8REV4HxajzVsIBxz3iL
|
||||
9zAfBgNVHSMEGDAWgBTHE0KKW8REV4HxajzVsIBxz3iL9zAPBgNVHRMBAf8EBTAD
|
||||
AQH/MBoGA1UdEQQTMBGHBH8AAAGCCWxvY2FsaG9zdDANBgkqhkiG9w0BAQsFAAOC
|
||||
AQEAYlTEftrwGJBXuPmtxhmtw2HO/VTC4TGnq67hH5H+ptwgZJuuxCQ5KW6flTyp
|
||||
FTyMhha33WD4EBL3wqqJsWr9Y4BXqi4G0lRqXBcC1oIUa2VYIDMER7kaY1qTSqE8
|
||||
ARpwdB2BhvngAzDLc+4Jt4jQMRGr8fHAwxpDBoIZ1knbyzYNP73Bajse6/8YtxUu
|
||||
nB2BsldjZnLvyHvRxUpWp92OyQih4jYSrlN6olDFlKDg7++kMhkHtJQW9a1t54VN
|
||||
0ZXrB1ZRuHUUvGBq26x71riTWor7HNOSQaGeCMQjZNQkh5tfshNygUGSZVXTEwhG
|
||||
xSrOL7NqBt2+EkVwf7LjGzjmBw==
|
||||
-----END CERTIFICATE-----
|
||||
28
tests/server.key
Normal file
28
tests/server.key
Normal file
@@ -0,0 +1,28 @@
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDx78mogNhTnoWw
|
||||
Ra51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf3rwj
|
||||
79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJCSGxI
|
||||
in9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4ZbPg
|
||||
wep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaEnQrA
|
||||
lTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJxrjVE
|
||||
PfA8QSbtAgMBAAECggEACgNK4klq1T3IpKdNoBY5yoE7CbUQZBNkBpSPRxHgBezj
|
||||
SVFfgrZGnOySrIJSt4JHtuynG2Hl+0ku74HRep/ck+eOsh5W3mZvGvMLnGxhwR3u
|
||||
Or99osTIgU0VQTkpC0SLQ16FCnih0uJycNIikdLR7uuya1tt1OyIBzK7XlNGIywT
|
||||
p85zJc7/6TfTC9eM7lqh7JGR7KplBxSvgZL1pUr7y4rNpKms6uzOvPND79CcKnbU
|
||||
BBA9Tu4qdOkoOljsZKkvh3pihxyG9X6d8QTZ/uX3pkvliwSFBc+Sz9EootA3/4r5
|
||||
gVWpQ2t/AY7fY4hqzLIX/HivVaPj3cWk1G+SHm0XNQKBgQD5I9rijqFvV/p6FmUl
|
||||
FbnjJFFHHgZLivlGxAC5vOyJNQQaqdeDzg7yMotNmQTggVGjT6sjdosQb3n+ctuk
|
||||
EhQnZSU5VkNKv1+PTR35WrRkaECCaqz3Pv79pV9GVcX3it7UuYjNiOeSPqINWe+X
|
||||
49JwnJFz+qQ1BchAwOis4zkENwKBgQD4mShDaYLOO97VpgZj4cGxHHWyEK9CRQvp
|
||||
I7HxRmfaWS3JHwb88lOmALEU6pAj5cYJPAznv8BnUWcVHalZbkQ1JWYtUJRqj6OI
|
||||
Ym7rw/nm4Ay5ijbdEism173dSk3IjOe+PdAlxzsOuVzYdBTqElmeQWtBzhY9aHvX
|
||||
r+A02C2j+wKBgHHDo6Gsi57yR5gUPd9vSlCkNtEIrss0DJv5yHMIB+KnaNZcE+NF
|
||||
5qFF30Jxyz5RDtxJ9tXcvaeln8lG3XDQKI/MqfDCqTuqo5ImHrfMaW8oA70JxS2p
|
||||
gHqGVzkg1aMxsIrmpcdk6olnPExocvWivGdbtzeEjhMALu8Sp6y6nUCFAoGBAK5h
|
||||
KLgYw/OMVaQCIMthaa+l6f0s7PMMYe1453H6VBD6qz4/8HPwO7LfG1gzrUYxADgs
|
||||
ElVh0UHn/On383nS+i9Ze5Hfyyvwc+LQQURKJPrJQMPJavCptPE7NmiKnYNHK6vr
|
||||
yh0l4oxShAklbCJBGvICq4zuVfVfXDeQnDIVTfaPAoGBAMCrZqYdOUhUu+aUqxZq
|
||||
qO/TTQxrxftU63jGUg+o042TdgI4KWLn07wvHJ8/E2OqF35eXenvcuKbNLI1l72J
|
||||
4cp+3cUv8iAXThTRYEztr5CS/wta4o4CNN8zfjn5dV9AI4Hmt4V7EaGWpBcViGbj
|
||||
n0Mhag+dO8DHuenqi1yfMrAt
|
||||
-----END PRIVATE KEY-----
|
||||
Reference in New Issue
Block a user