mirror of
https://github.com/xroche/httrack.git
synced 2026-06-28 13:07:35 +03:00
Compare commits
18 Commits
dns-multia
...
fix/urlhac
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
600001b282 | ||
|
|
40a66600ff | ||
|
|
768756e231 | ||
|
|
b138c87a93 | ||
|
|
3de47433b7 | ||
|
|
fb8827718e | ||
|
|
7228210061 | ||
|
|
38882c0aee | ||
|
|
bfc4a016ab | ||
|
|
756d8fb8bd | ||
|
|
5501faa7b1 | ||
|
|
6322b6fb1f | ||
|
|
58f368a91a | ||
|
|
c97b3e233e | ||
|
|
b615a4e7fd | ||
|
|
594cf0da39 | ||
|
|
3845cd1fb3 | ||
|
|
94bffb0804 |
77
.github/workflows/ci.yml
vendored
77
.github/workflows/ci.yml
vendored
@@ -188,6 +188,51 @@ jobs:
|
|||||||
if: failure()
|
if: failure()
|
||||||
run: cat tests/test-suite.log 2>/dev/null || true
|
run: cat tests/test-suite.log 2>/dev/null || true
|
||||||
|
|
||||||
|
# MemorySanitizer catches reads of uninitialized memory (#143's stack-garbage
|
||||||
|
# size filter) that ASan/UBSan miss. It flags any byte an uninstrumented lib
|
||||||
|
# wrote, so the job stays in our own code: offline self-tests only, no openssl
|
||||||
|
# (--disable-https), no zlib cache tests, static (the runtime is not in .so's).
|
||||||
|
msan:
|
||||||
|
name: msan (MemorySanitizer, clang)
|
||||||
|
runs-on: ubuntu-24.04
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
|
||||||
|
- name: Install build dependencies
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y --no-install-recommends \
|
||||||
|
build-essential clang autoconf automake libtool autoconf-archive \
|
||||||
|
zlib1g-dev
|
||||||
|
|
||||||
|
- name: Configure (MSan, static, no https)
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
autoreconf -fi
|
||||||
|
./configure CC=clang \
|
||||||
|
CFLAGS="-fsanitize=memory -fsanitize-memory-track-origins=2 -fno-sanitize-recover=all -g -O1 -fno-omit-frame-pointer" \
|
||||||
|
LDFLAGS="-fsanitize=memory" \
|
||||||
|
--disable-https --disable-shared --enable-static
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: make -j"$(nproc)"
|
||||||
|
|
||||||
|
- name: Test (offline self-tests under MSan)
|
||||||
|
env:
|
||||||
|
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||||
|
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||||
|
make check TESTS="$tests"
|
||||||
|
|
||||||
|
- name: Print the test log on failure
|
||||||
|
if: failure()
|
||||||
|
run: cat tests/test-suite.log 2>/dev/null || true
|
||||||
|
|
||||||
# Optional-dependency build: compile and test with HTTPS/OpenSSL disabled --
|
# Optional-dependency build: compile and test with HTTPS/OpenSSL disabled --
|
||||||
# the configuration users on minimal systems build, and one libssl is not even
|
# the configuration users on minimal systems build, and one libssl is not even
|
||||||
# installed here so configure cannot silently re-enable it. The matrix above
|
# installed here so configure cannot silently re-enable it. The matrix above
|
||||||
@@ -232,30 +277,42 @@ jobs:
|
|||||||
deb:
|
deb:
|
||||||
name: deb package (lintian)
|
name: deb package (lintian)
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
# Build and gate inside Debian sid, the upload target. A Debian dpkg-deb
|
||||||
|
# produces archive-legal xz members (an Ubuntu host defaults to zstd, which
|
||||||
|
# the archive's lintian rejects), and sid's lintian carries the same
|
||||||
|
# data-driven checks (embedded-lib fingerprints and the like) the buildds and
|
||||||
|
# UDD apply -- so issues surface here instead of after upload.
|
||||||
|
container: debian:sid
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Install packaging toolchain
|
- name: Install packaging toolchain
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
sudo apt-get update
|
apt-get update
|
||||||
sudo apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates git \
|
||||||
build-essential autoconf automake libtool autoconf-archive \
|
build-essential autoconf automake libtool autoconf-archive \
|
||||||
zlib1g-dev libssl-dev \
|
zlib1g-dev libssl-dev \
|
||||||
debhelper devscripts lintian fakeroot
|
debhelper devscripts lintian fakeroot
|
||||||
|
|
||||||
|
- uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
|
||||||
# --unsigned: CI has no GPG key (also skips the release sig/checksums).
|
# --unsigned: CI has no GPG key (also skips the release sig/checksums).
|
||||||
# debuild builds every package, then lintian gates on errors.
|
# mkdeb builds every package then runs the lintian gate (--fail-on=error,
|
||||||
|
# warning); debuild runs the packaged test pass.
|
||||||
#
|
#
|
||||||
# DEB_BUILD_OPTIONS trims work CI does not need (release builds via
|
# DEB_BUILD_OPTIONS trims work CI does not need (release builds via
|
||||||
# mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
|
# mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
|
||||||
# LTO payloads are slow to compress and that CI never ships; parallel uses
|
# LTO payloads are slow to compress and that CI never ships; parallel uses
|
||||||
# every core. We let debuild run its test pass -- the only one now that
|
# every core.
|
||||||
# mkdeb no longer runs its own -- so CI exercises the packaged tests.
|
- name: Build and lint Debian packages
|
||||||
- name: Build Debian packages
|
|
||||||
run: |
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
# The workspace volume is owned by the host runner uid, but the
|
||||||
|
# container runs as root, so mkdeb's git calls (superproject and the
|
||||||
|
# coucal submodule) trip "dubious ownership"; mark them all safe.
|
||||||
|
git config --global --add safe.directory "*"
|
||||||
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
||||||
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
||||||
|
|
||||||
|
|||||||
@@ -33,8 +33,9 @@ the operational checklist: toolchain, invariants, and how to ship a change.
|
|||||||
- Be terse. Comment the why, in English; translate French comments you touch.
|
- Be terse. Comment the why, in English; translate French comments you touch.
|
||||||
- Strip AI tells from prose (em-dash overuse, rule-of-three, filler, vague
|
- Strip AI tells from prose (em-dash overuse, rule-of-three, filler, vague
|
||||||
attributions). Ref: Wikipedia "Signs of AI writing". Claude Code: `/humanizer`.
|
attributions). Ref: Wikipedia "Signs of AI writing". Claude Code: `/humanizer`.
|
||||||
- Behavior change → add a test. Fast path: a hidden `httrack -#N` debug
|
- Behavior change → add a test. Fast path: a hidden `httrack -#test=NAME` engine
|
||||||
subcommand (`htscoremain.c`) driven by a `tests/NN_*.test`, over a slow crawl.
|
self-test (registry in `htsselftest.c`; `-#test` lists them) driven by a
|
||||||
|
`tests/NN_*.test`, over a slow crawl.
|
||||||
|
|
||||||
## Review your change adversarially (strongly suggested)
|
## Review your change adversarially (strongly suggested)
|
||||||
Before pushing, and when reviewing others, don't skim for bugs:
|
Before pushing, and when reviewing others, don't skim for bugs:
|
||||||
|
|||||||
@@ -215,9 +215,12 @@ AC_SUBST(OPENSSL_LIBS)
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
### Support IPv6
|
### Support IPv6
|
||||||
|
V6_SUPPORT=no
|
||||||
AC_CHECK_LIB(c, getaddrinfo, [V6_FLAG="-DINET6"
|
AC_CHECK_LIB(c, getaddrinfo, [V6_FLAG="-DINET6"
|
||||||
|
V6_SUPPORT=yes
|
||||||
AC_DEFINE(HTS_INET6, 1, [Check for IPv6])], AC_MSG_WARN([*** IPv6 not found IPv6 compatibility disabled]))
|
AC_DEFINE(HTS_INET6, 1, [Check for IPv6])], AC_MSG_WARN([*** IPv6 not found IPv6 compatibility disabled]))
|
||||||
AC_SUBST(V6_FLAG)
|
AC_SUBST(V6_FLAG)
|
||||||
|
AC_SUBST(V6_SUPPORT)
|
||||||
|
|
||||||
### Check for LFS
|
### Check for LFS
|
||||||
AC_CHECK_LIB(c, fopen64, [LFS_FLAG="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE"
|
AC_CHECK_LIB(c, fopen64, [LFS_FLAG="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE"
|
||||||
|
|||||||
5
debian/libhttrack3.lintian-overrides
vendored
5
debian/libhttrack3.lintian-overrides
vendored
@@ -1,3 +1,8 @@
|
|||||||
# The shared libraries ship without a versioned symbols control file (ABI is
|
# The shared libraries ship without a versioned symbols control file (ABI is
|
||||||
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
||||||
libhttrack3: no-symbols-control-file usr/lib/*
|
libhttrack3: no-symbols-control-file usr/lib/*
|
||||||
|
|
||||||
|
# Bundled, locally patched minizip (src/minizip): it adds a zipFlush() API the
|
||||||
|
# system libminizip lacks (htscache.c flushes the cache .zip so an interrupted
|
||||||
|
# crawl leaves a valid archive), plus Android/old-zlib portability fixes.
|
||||||
|
libhttrack3: embedded-library *libminizip*
|
||||||
|
|||||||
3
debian/proxytrack.lintian-overrides
vendored
Normal file
3
debian/proxytrack.lintian-overrides
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Statically linked against httrack's bundled, patched minizip (see src/minizip
|
||||||
|
# and libhttrack3's override): the zipFlush() API is absent from the system one.
|
||||||
|
proxytrack: embedded-library *libminizip*
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
.\"
|
.\"
|
||||||
.\" This file is generated by man/makeman.sh; do not edit by hand.
|
.\" This file is generated by man/makeman.sh; do not edit by hand.
|
||||||
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
.TH httrack 1 "13 June 2026" "httrack website copier"
|
.TH httrack 1 "27 June 2026" "httrack website copier"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
httrack \- offline browser : copy websites to a local directory
|
httrack \- offline browser : copy websites to a local directory
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
@@ -43,6 +43,7 @@ httrack \- offline browser : copy websites to a local directory
|
|||||||
[ \fB\-x, \-\-replace\-external\fR ]
|
[ \fB\-x, \-\-replace\-external\fR ]
|
||||||
[ \fB\-%x, \-\-disable\-passwords\fR ]
|
[ \fB\-%x, \-\-disable\-passwords\fR ]
|
||||||
[ \fB\-%q, \-\-include\-query\-string\fR ]
|
[ \fB\-%q, \-\-include\-query\-string\fR ]
|
||||||
|
[ \fB\-%g, \-\-strip\-query\fR ]
|
||||||
[ \fB\-o, \-\-generate\-errors\fR ]
|
[ \fB\-o, \-\-generate\-errors\fR ]
|
||||||
[ \fB\-X, \-\-purge\-old[=N]\fR ]
|
[ \fB\-X, \-\-purge\-old[=N]\fR ]
|
||||||
[ \fB\-%p, \-\-preserve\fR ]
|
[ \fB\-%p, \-\-preserve\fR ]
|
||||||
@@ -198,6 +199,8 @@ replace external html links by error pages (\-\-replace\-external)
|
|||||||
do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords)
|
do not include any password for external password protected websites (%x0 include) (\-\-disable\-passwords)
|
||||||
.IP \-%q
|
.IP \-%q
|
||||||
*include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string)
|
*include query string for local files (useless, for information purpose only) (%q0 don't include) (\-\-include\-query\-string)
|
||||||
|
.IP \-%g
|
||||||
|
strip query keys for dedup ([host/pattern=]key1,key2,...) (\-\-strip\-query <param>)
|
||||||
.IP \-o
|
.IP \-o
|
||||||
*generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors)
|
*generate output html file in case of error (404..) (o0 don't generate) (\-\-generate\-errors)
|
||||||
.IP \-X
|
.IP \-X
|
||||||
@@ -225,6 +228,8 @@ tolerant requests (accept bogus responses on some servers, but not standard!) (\
|
|||||||
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
|
update hacks: various hacks to limit re\-transfers when updating (identical size, bogus response..) (\-\-updatehack)
|
||||||
.IP \-%u
|
.IP \-%u
|
||||||
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
|
url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..) (\-\-urlhack)
|
||||||
|
.br
|
||||||
|
opt out of one url\-hack part: \-\-keep\-www\-prefix (www.foo.com<>foo.com), \-\-keep\-double\-slashes (//), \-\-keep\-query\-order (?b&a)
|
||||||
.IP \-%A
|
.IP \-%A
|
||||||
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
|
assume that a type (cgi,asp..) is always linked with a mime type (\-%A php3,cgi=text/html;dat,bin=application/x\-zip) (\-\-assume <param>)
|
||||||
.br
|
.br
|
||||||
@@ -313,12 +318,8 @@ debug HTTP headers in logfile (\-\-debug\-headers)
|
|||||||
.SS Guru options: (do NOT use if possible)
|
.SS Guru options: (do NOT use if possible)
|
||||||
.IP \-#X
|
.IP \-#X
|
||||||
*use optimized engine (limited memory boundary checks) (\-\-fast\-engine)
|
*use optimized engine (limited memory boundary checks) (\-\-fast\-engine)
|
||||||
.IP \-#0
|
.IP \-#test
|
||||||
filter test (\-#0 '*.gif' 'www.bar.com/foo.gif') (\-\-debug\-testfilters <param>)
|
list engine self\-tests (run one with \-#test=NAME [args])
|
||||||
.IP \-#1
|
|
||||||
simplify test (\-#1 ./foo/bar/../foobar)
|
|
||||||
.IP \-#2
|
|
||||||
type test (\-#2 /foo/bar.php)
|
|
||||||
.IP \-#C
|
.IP \-#C
|
||||||
cache list (\-#C '*.com/spider*.gif' (\-\-debug\-cache <param>)
|
cache list (\-#C '*.com/spider*.gif' (\-\-debug\-cache <param>)
|
||||||
.IP \-#R
|
.IP \-#R
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ whttrackrundir = $(bindir)
|
|||||||
whttrackrun_SCRIPTS = webhttrack
|
whttrackrun_SCRIPTS = webhttrack
|
||||||
|
|
||||||
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||||
htscache_selftest.c htsdns_selftest.c \
|
htscache_selftest.c htsdns_selftest.c htsselftest.c \
|
||||||
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
||||||
htshelp.c htslib.c htscoremain.c \
|
htshelp.c htslib.c htscoremain.c \
|
||||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||||
@@ -66,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
|||||||
md5.c \
|
md5.c \
|
||||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htscatchurl.h \
|
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htsselftest.h htscatchurl.h \
|
||||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||||
|
|||||||
@@ -60,6 +60,9 @@ Please visit our Website: http://www.httrack.com
|
|||||||
param1 : this option must be alone, and needs one distinct parameter (-P <path>)
|
param1 : this option must be alone, and needs one distinct parameter (-P <path>)
|
||||||
param0 : this option must be alone, but the parameter should be put together (+*.gif)
|
param0 : this option must be alone, but the parameter should be put together (+*.gif)
|
||||||
*/
|
*/
|
||||||
|
/* clang-format off: hand-aligned table; clang-format reflows the whole
|
||||||
|
initializer (2->4 space) on any edit, churning every untouched row. */
|
||||||
|
/* clang-format off */
|
||||||
const char *hts_optalias[][4] = {
|
const char *hts_optalias[][4] = {
|
||||||
/* {"","","",""}, */
|
/* {"","","",""}, */
|
||||||
{"path", "-O", "param1", "output path"},
|
{"path", "-O", "param1", "output path"},
|
||||||
@@ -107,6 +110,8 @@ const char *hts_optalias[][4] = {
|
|||||||
{"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x",
|
{"disable-passwords", "-%x", "single", ""}, {"disable-password", "-%x",
|
||||||
"single", ""},
|
"single", ""},
|
||||||
{"include-query-string", "-%q", "single", ""},
|
{"include-query-string", "-%q", "single", ""},
|
||||||
|
{"strip-query", "-%g", "param1",
|
||||||
|
"strip [host/pattern=]key1,key2,... from URLs"},
|
||||||
{"generate-errors", "-o", "single", ""},
|
{"generate-errors", "-o", "single", ""},
|
||||||
{"do-not-generate-errors", "-o0", "single", ""},
|
{"do-not-generate-errors", "-o0", "single", ""},
|
||||||
{"purge-old", "-X", "param", ""},
|
{"purge-old", "-X", "param", ""},
|
||||||
@@ -123,6 +128,9 @@ const char *hts_optalias[][4] = {
|
|||||||
{"tolerant", "-%B", "single", ""},
|
{"tolerant", "-%B", "single", ""},
|
||||||
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
|
{"updatehack", "-%s", "single", ""}, {"sizehack", "-%s", "single", ""},
|
||||||
{"urlhack", "-%u", "single", ""},
|
{"urlhack", "-%u", "single", ""},
|
||||||
|
{"keep-www-prefix", "-%j", "single", ""},
|
||||||
|
{"keep-double-slashes", "-%o", "single", ""},
|
||||||
|
{"keep-query-order", "-%y", "single", ""},
|
||||||
{"user-agent", "-F", "param1", "user-agent identity"},
|
{"user-agent", "-F", "param1", "user-agent identity"},
|
||||||
{"referer", "-%R", "param1", "default referer URL"},
|
{"referer", "-%R", "param1", "default referer URL"},
|
||||||
{"from", "-%E", "param1", "from email address"},
|
{"from", "-%E", "param1", "from email address"},
|
||||||
@@ -241,6 +249,7 @@ const char *hts_optalias[][4] = {
|
|||||||
|
|
||||||
{"", "", "", ""}
|
{"", "", "", ""}
|
||||||
};
|
};
|
||||||
|
/* clang-format on */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Check for alias in command-line
|
Check for alias in command-line
|
||||||
|
|||||||
271
src/htsback.c
271
src/htsback.c
@@ -57,7 +57,10 @@ Please visit our Website: http://www.httrack.com
|
|||||||
// DOS
|
// DOS
|
||||||
#include <process.h> /* _beginthread, _endthread */
|
#include <process.h> /* _beginthread, _endthread */
|
||||||
#endif
|
#endif
|
||||||
|
#include <io.h> /* _chsize_s */
|
||||||
|
#define HTS_FTRUNCATE(fp, sz) _chsize_s(_fileno(fp), (sz))
|
||||||
#else
|
#else
|
||||||
|
#define HTS_FTRUNCATE(fp, sz) ftruncate(fileno(fp), (sz))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define VT_CLREOL "\33[K"
|
#define VT_CLREOL "\33[K"
|
||||||
@@ -73,6 +76,8 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
|||||||
|
|
||||||
sback->count = back_max;
|
sback->count = back_max;
|
||||||
sback->lnk = (lien_back *) calloct((back_max + 1), sizeof(lien_back));
|
sback->lnk = (lien_back *) calloct((back_max + 1), sizeof(lien_back));
|
||||||
|
sback->connect_fallback = (hts_connect_fallback *) calloct(
|
||||||
|
(back_max + 1), sizeof(hts_connect_fallback));
|
||||||
sback->ready = coucal_new(0);
|
sback->ready = coucal_new(0);
|
||||||
hts_set_hash_handler(sback->ready, opt);
|
hts_set_hash_handler(sback->ready, opt);
|
||||||
coucal_set_name(sback->ready, "back_new");
|
coucal_set_name(sback->ready, "back_new");
|
||||||
@@ -83,6 +88,7 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
|||||||
sback->lnk[i].r.location = sback->lnk[i].location_buffer;
|
sback->lnk[i].r.location = sback->lnk[i].location_buffer;
|
||||||
sback->lnk[i].status = STATUS_FREE;
|
sback->lnk[i].status = STATUS_FREE;
|
||||||
sback->lnk[i].r.soc = INVALID_SOCKET;
|
sback->lnk[i].r.soc = INVALID_SOCKET;
|
||||||
|
sback->connect_fallback[i].addr_count = -1; // not yet probed
|
||||||
}
|
}
|
||||||
return sback;
|
return sback;
|
||||||
}
|
}
|
||||||
@@ -93,6 +99,7 @@ void back_free(struct_back ** sback) {
|
|||||||
freet((*sback)->lnk);
|
freet((*sback)->lnk);
|
||||||
(*sback)->lnk = NULL;
|
(*sback)->lnk = NULL;
|
||||||
}
|
}
|
||||||
|
freet((*sback)->connect_fallback);
|
||||||
if ((*sback)->ready != NULL) {
|
if ((*sback)->ready != NULL) {
|
||||||
coucal_delete(&(*sback)->ready);
|
coucal_delete(&(*sback)->ready);
|
||||||
(*sback)->ready_size_bytes = 0;
|
(*sback)->ready_size_bytes = 0;
|
||||||
@@ -102,6 +109,72 @@ void back_free(struct_back ** sback) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Upper bound, in seconds, on how long one connect attempt may run while a
   further resolved address is still available. It replaces the full slot
   timeout for every candidate except the last one, so a dead first address
   (typically IPv6) cannot stall the slot; the value stays well above a normal
   handshake. */
#define HTS_CONNECT_FALLBACK_TIMEOUT 10

/* Tell whether a connecting slot should give up on its current address and
   move to the next one.

   addr_index  index of the address currently being tried
   addr_count  number of resolved addresses for the host
   elapsed     seconds spent on the current attempt
   timeout     slot timeout in seconds (<= 0: no timeout management)

   Returns 1 when the per-candidate deadline (the smaller of timeout and
   HTS_CONNECT_FALLBACK_TIMEOUT) has been reached and another address
   remains, 0 otherwise. */
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
                              int timeout) {
  const int has_next = addr_index + 1 < addr_count;
  int cap = HTS_CONNECT_FALLBACK_TIMEOUT;

  /* last (or only) candidate, or timeouts disabled: never force a fallback */
  if (!has_next || timeout <= 0)
    return 0;

  /* a slot timeout shorter than the cap wins */
  if (timeout < cap)
    cap = timeout;

  return elapsed >= cap;
}
|
||||||
|
|
||||||
|
/* Pending-connect result for a non-blocking socket reported ready by select():
|
||||||
|
0 = connected, >0 = the connect errno (refused, unreachable, ...), -1 if the
|
||||||
|
probe itself failed. A failed connect is reported writable too, so this is
|
||||||
|
how success is told from failure without blocking. */
|
||||||
|
static int connect_socket_error(T_SOC soc) {
|
||||||
|
int soerr = 0;
|
||||||
|
socklen_t len = (socklen_t) sizeof(soerr);
|
||||||
|
|
||||||
|
if (getsockopt(soc, SOL_SOCKET, SO_ERROR, (char *) &soerr, &len) != 0)
|
||||||
|
return -1;
|
||||||
|
return soerr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retry a stuck/failed connecting slot against its next resolved address.
|
||||||
|
Closes the current socket and starts a non-blocking connect to the next
|
||||||
|
candidate, leaving the slot in STATUS_CONNECTING. Returns 1 if a new connect
|
||||||
|
was started, 0 if no fallback address remains (caller fails the slot). */
|
||||||
|
static int back_connect_next(httrackp *opt, struct_back *sback, int i) {
|
||||||
|
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||||
|
lien_back *const back = sback->lnk;
|
||||||
|
const int next = cf->addr_index + 1;
|
||||||
|
T_SOC soc;
|
||||||
|
|
||||||
|
if (next >= cf->addr_count)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (back[i].r.soc != INVALID_SOCKET) {
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
}
|
||||||
|
soc = newhttp_addr(opt, back[i].url_adr, &back[i].r, -1, 0, next, NULL);
|
||||||
|
if (soc == INVALID_SOCKET)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
back[i].r.soc = soc;
|
||||||
|
cf->addr_index = next;
|
||||||
|
cf->connect_start = time_local();
|
||||||
|
if (back[i].timeout > 0)
|
||||||
|
back[i].timeout_refresh = cf->connect_start;
|
||||||
|
back[i].status = STATUS_CONNECTING;
|
||||||
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
|
"connect failed, trying next address (%d/%d) for %s", next + 1,
|
||||||
|
cf->addr_count, back[i].url_adr);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
void back_delete_all(httrackp * opt, cache_back * cache, struct_back * sback) {
|
void back_delete_all(httrackp * opt, cache_back * cache, struct_back * sback) {
|
||||||
if (sback != NULL) {
|
if (sback != NULL) {
|
||||||
int i;
|
int i;
|
||||||
@@ -1911,8 +1984,11 @@ int back_add(struct_back * sback, httrackp * opt, cache_back * cache, const char
|
|||||||
// ouvrir liaison, envoyer requète
|
// ouvrir liaison, envoyer requète
|
||||||
// ne pas traiter ou recevoir l'en tête immédiatement
|
// ne pas traiter ou recevoir l'en tête immédiatement
|
||||||
hts_init_htsblk(&back[p].r);
|
hts_init_htsblk(&back[p].r);
|
||||||
//memset(&(back[p].r), 0, sizeof(htsblk));
|
// memset(&(back[p].r), 0, sizeof(htsblk));
|
||||||
back[p].r.location = back[p].location_buffer;
|
back[p].r.location = back[p].location_buffer;
|
||||||
|
// fresh connect: address list not yet probed, start at the first
|
||||||
|
sback->connect_fallback[p].addr_index = 0;
|
||||||
|
sback->connect_fallback[p].addr_count = -1;
|
||||||
// recopier proxy
|
// recopier proxy
|
||||||
if ((back[p].r.req.proxy.active = opt->proxy.active)) {
|
if ((back[p].r.req.proxy.active = opt->proxy.active)) {
|
||||||
if (StringBuff(opt->proxy.bindhost) != NULL)
|
if (StringBuff(opt->proxy.bindhost) != NULL)
|
||||||
@@ -2369,21 +2445,25 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
// en cas de gestion du connect préemptif
|
// en cas de gestion du connect préemptif
|
||||||
#if HTS_XCONN
|
#if HTS_XCONN
|
||||||
if (back[i].status == STATUS_CONNECTING) { // connexion
|
if (back[i].status == STATUS_CONNECTING) { // connexion
|
||||||
do_wait = 1;
|
// a connecting slot always carries a live socket; guard anyway so a
|
||||||
|
// stray INVALID_SOCKET can never reach FD_SET (mirrors the recv branch)
|
||||||
|
if (back[i].r.soc != INVALID_SOCKET) {
|
||||||
|
do_wait = 1;
|
||||||
|
|
||||||
// noter socket write
|
// noter socket write
|
||||||
FD_SET(back[i].r.soc, &fds_c);
|
FD_SET(back[i].r.soc, &fds_c);
|
||||||
|
|
||||||
// noter socket erreur
|
// noter socket erreur
|
||||||
FD_SET(back[i].r.soc, &fds_e);
|
FD_SET(back[i].r.soc, &fds_e);
|
||||||
|
|
||||||
// calculer max
|
// calculer max
|
||||||
if (max_c) {
|
if (max_c) {
|
||||||
max_c = 0;
|
max_c = 0;
|
||||||
nfds = back[i].r.soc;
|
nfds = back[i].r.soc;
|
||||||
} else if (back[i].r.soc > nfds) {
|
} else if (back[i].r.soc > nfds) {
|
||||||
// ID socket la plus élevée
|
// ID socket la plus élevée
|
||||||
nfds = back[i].r.soc;
|
nfds = back[i].r.soc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else
|
} else
|
||||||
@@ -2517,8 +2597,20 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
}
|
}
|
||||||
// ---- FLAG WRITE MIS A UN?: POUR LE CONNECT
|
// ---- FLAG WRITE MIS A UN?: POUR LE CONNECT
|
||||||
if (back[i].status == STATUS_CONNECTING) { // attendre connect
|
if (back[i].status == STATUS_CONNECTING) { // attendre connect
|
||||||
|
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||||
int dispo = 0;
|
int dispo = 0;
|
||||||
|
|
||||||
|
// probe the resolved address list once per fresh connect (cache hit:
|
||||||
|
// the host was resolved when this connect was opened)
|
||||||
|
if (cf->addr_count < 0 && back[i].r.soc != INVALID_SOCKET &&
|
||||||
|
!back[i].r.is_file) {
|
||||||
|
SOCaddr scratch[HTS_MAXADDRNUM];
|
||||||
|
|
||||||
|
cf->addr_count = hts_dns_resolve_all(opt, back[i].url_adr, scratch,
|
||||||
|
HTS_MAXADDRNUM, NULL);
|
||||||
|
cf->connect_start = time_local();
|
||||||
|
}
|
||||||
|
|
||||||
// vérifier l'existance de timeout-check
|
// vérifier l'existance de timeout-check
|
||||||
if (!gestion_timeout)
|
if (!gestion_timeout)
|
||||||
if (back[i].timeout > 0)
|
if (back[i].timeout > 0)
|
||||||
@@ -2526,7 +2618,20 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
|
|
||||||
// connecté?
|
// connecté?
|
||||||
dispo = FD_ISSET(back[i].r.soc, &fds_c);
|
dispo = FD_ISSET(back[i].r.soc, &fds_c);
|
||||||
if (dispo) { // ok connected!!
|
if (dispo) { // socket ready: connect() finished (ok or failed)
|
||||||
|
// a refused/failed connect is reported writable too; probe SO_ERROR
|
||||||
|
// and, on failure, fall back to the next address (or fail the slot)
|
||||||
|
if (connect_socket_error(back[i].r.soc) != 0) {
|
||||||
|
if (!back_connect_next(opt, sback, i)) {
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||||
|
strcpybuff(back[i].r.msg, "Connect Error");
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
}
|
||||||
|
continue; // reconnected (stay connecting) or failed
|
||||||
|
}
|
||||||
busy_state = 1;
|
busy_state = 1;
|
||||||
|
|
||||||
#if HTS_USEOPENSSL
|
#if HTS_USEOPENSSL
|
||||||
@@ -3661,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
/********** **************************** ********** */
|
/********** **************************** ********** */
|
||||||
} else { // il faut aller le chercher
|
}
|
||||||
|
// MIME type excluded by a -mime: filter: abort, don't fetch
|
||||||
|
// the body (#58)
|
||||||
|
else if (HTTP_IS_OK(back[i].r.statuscode) &&
|
||||||
|
!back[i].testmode &&
|
||||||
|
strnotempty(back[i].r.contenttype) &&
|
||||||
|
hts_acceptmime(opt, 0, back[i].url_adr,
|
||||||
|
back[i].url_fil,
|
||||||
|
back[i].r.contenttype) == 1) {
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
back[i].r.statuscode = STATUSCODE_EXCLUDED;
|
||||||
|
strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
|
||||||
|
hts_log_print(
|
||||||
|
opt, LOG_NOTICE,
|
||||||
|
"File excluded by MIME type filter (%s): %s%s",
|
||||||
|
back[i].r.contenttype, back[i].url_adr,
|
||||||
|
back[i].url_fil);
|
||||||
|
} else { // il faut aller le chercher
|
||||||
|
|
||||||
// effacer buffer (requète)
|
// effacer buffer (requète)
|
||||||
if (!noFreebuff) {
|
if (!noFreebuff) {
|
||||||
@@ -3672,35 +3797,70 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
// xxc SI CHUNK VERIFIER QUE CA MARCHE??
|
// xxc SI CHUNK VERIFIER QUE CA MARCHE??
|
||||||
if (back[i].r.statuscode == 206) { // on nous envoie un morceau (la fin) coz une partie sur disque!
|
if (back[i].r.statuscode == 206) { // on nous envoie un morceau (la fin) coz une partie sur disque!
|
||||||
off_t sz = fsize_utf8(back[i].url_sav);
|
off_t sz = fsize_utf8(back[i].url_sav);
|
||||||
|
/* RFC 7233: resume at the server's Content-Range start,
|
||||||
|
not the offset we requested; a server may resume
|
||||||
|
earlier and appending the overlap duplicates bytes
|
||||||
|
(#198). */
|
||||||
|
const LLint resume = back[i].r.crange_start;
|
||||||
|
const hts_boolean range_ok =
|
||||||
|
back[i].r.crange > 0 && resume >= 0 &&
|
||||||
|
resume <= (LLint) sz &&
|
||||||
|
back[i].r.crange_end + 1 == back[i].r.crange &&
|
||||||
|
(back[i].r.totalsize < 0 ||
|
||||||
|
back[i].r.totalsize ==
|
||||||
|
back[i].r.crange_end - resume + 1);
|
||||||
|
|
||||||
#if HDEBUG
|
#if HDEBUG
|
||||||
printf("partial content: " LLintP " on disk..\n",
|
printf("partial content: " LLintP " on disk..\n",
|
||||||
(LLint) sz);
|
(LLint) sz);
|
||||||
#endif
|
#endif
|
||||||
if (sz >= 0) {
|
if (sz >= 0 && range_ok) {
|
||||||
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
|
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
|
||||||
if (opt->getmode & HTS_GETMODE_NONHTML) {
|
if (opt->getmode & HTS_GETMODE_NONHTML) {
|
||||||
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
|
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
|
||||||
file_notify(opt, back[i].url_adr, back[i].url_fil,
|
file_notify(opt, back[i].url_adr, back[i].url_fil,
|
||||||
back[i].url_sav, 0, 1,
|
back[i].url_sav, 0, 1,
|
||||||
back[i].r.notmodified);
|
back[i].r.notmodified);
|
||||||
back[i].r.out = FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "ab"); // append
|
back[i].r.out =
|
||||||
|
FOPEN(fconv(catbuff, sizeof(catbuff),
|
||||||
|
back[i].url_sav),
|
||||||
|
"r+b"); // resume in place
|
||||||
if (back[i].r.out && opt->cache != 0) {
|
if (back[i].r.out && opt->cache != 0) {
|
||||||
back[i].r.is_write = 1; // écrire
|
back[i].r.is_write = 1;
|
||||||
back[i].r.size = sz; // déja écrit
|
back[i].r.size = resume; // bytes already on disk
|
||||||
back[i].r.statuscode = HTTP_OK; // Forcer 'OK'
|
back[i].r.statuscode = HTTP_OK; // force 'OK'
|
||||||
if (back[i].r.totalsize >= 0)
|
if (back[i].r.totalsize >= 0)
|
||||||
back[i].r.totalsize += sz; // plus en fait
|
back[i].r.totalsize += resume; // -> full size
|
||||||
fseek(back[i].r.out, 0, SEEK_END); // à la fin
|
// drop bytes past the resume point; a silent
|
||||||
/* create a temporary reference file in case of broken mirror */
|
// failure could leave a stale tail, so on error
|
||||||
if (back_serialize_ref(opt, &back[i]) != 0) {
|
// drop the partial and refetch the whole file
|
||||||
hts_log_print(opt, LOG_WARNING,
|
if (HTS_FTRUNCATE(back[i].r.out,
|
||||||
"Could not create temporary reference file for %s%s",
|
(off_t) resume) != 0) {
|
||||||
back[i].url_adr, back[i].url_fil);
|
fclose(back[i].r.out);
|
||||||
}
|
back[i].r.out = NULL;
|
||||||
|
url_savename_refname_remove(
|
||||||
|
opt, back[i].url_adr, back[i].url_fil);
|
||||||
|
UNLINK(back[i].url_sav);
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
strcpybuff(back[i].r.msg,
|
||||||
|
"Can not truncate partial file, "
|
||||||
|
"restarting");
|
||||||
|
} else {
|
||||||
|
fseeko(back[i].r.out, (off_t) resume, SEEK_SET);
|
||||||
|
/* create a temporary reference file in case of
|
||||||
|
* broken mirror */
|
||||||
|
if (back_serialize_ref(opt, &back[i]) != 0) {
|
||||||
|
hts_log_print(opt, LOG_WARNING,
|
||||||
|
"Could not create temporary "
|
||||||
|
"reference file for %s%s",
|
||||||
|
back[i].url_adr,
|
||||||
|
back[i].url_fil);
|
||||||
|
}
|
||||||
#if HDEBUG
|
#if HDEBUG
|
||||||
printf("continue interrupted file\n");
|
printf("continue interrupted file\n");
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
} else { // On est dans la m**
|
} else { // On est dans la m**
|
||||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||||
back_set_finished(sback, i);
|
back_set_finished(sback, i);
|
||||||
@@ -3712,17 +3872,18 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
FILE *fp =
|
FILE *fp =
|
||||||
FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "rb");
|
FOPEN(fconv(catbuff, sizeof(catbuff), back[i].url_sav), "rb");
|
||||||
if (fp) {
|
if (fp) {
|
||||||
LLint alloc_mem = sz + 1;
|
LLint alloc_mem = resume + 1;
|
||||||
|
|
||||||
if (back[i].r.totalsize >= 0)
|
if (back[i].r.totalsize >= 0)
|
||||||
alloc_mem += back[i].r.totalsize; // AJOUTER RESTANT!
|
alloc_mem += back[i].r.totalsize; // AJOUTER RESTANT!
|
||||||
if (deleteaddr(&back[i].r)
|
if (deleteaddr(&back[i].r)
|
||||||
&& (back[i].r.adr =
|
&& (back[i].r.adr =
|
||||||
(char *) malloct((size_t) alloc_mem))) {
|
(char *) malloct((size_t) alloc_mem))) {
|
||||||
back[i].r.size = sz;
|
back[i].r.size = resume;
|
||||||
if (back[i].r.totalsize >= 0)
|
if (back[i].r.totalsize >= 0)
|
||||||
back[i].r.totalsize += sz; // plus en fait
|
back[i].r.totalsize += resume; // -> full size
|
||||||
if ((fread(back[i].r.adr, 1, sz, fp)) != sz) {
|
if ((fread(back[i].r.adr, 1, (size_t) resume,
|
||||||
|
fp)) != (size_t) resume) {
|
||||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||||
back_set_finished(sback, i);
|
back_set_finished(sback, i);
|
||||||
strcpybuff(back[i].r.msg,
|
strcpybuff(back[i].r.msg,
|
||||||
@@ -3740,14 +3901,30 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
"No memory for partial file");
|
"No memory for partial file");
|
||||||
}
|
}
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
} else { // Argh..
|
} else { // open failed
|
||||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||||
back_set_finished(sback, i);
|
back_set_finished(sback, i);
|
||||||
strcpybuff(back[i].r.msg,
|
strcpybuff(back[i].r.msg,
|
||||||
"Can not open partial file");
|
"Can not open partial file");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else { // Non trouvé??
|
} else if (sz >=
|
||||||
|
0) { // unusable range -> restart whole file
|
||||||
|
hts_log_print(opt, LOG_WARNING,
|
||||||
|
"Unusable partial-content range for %s%s "
|
||||||
|
"(have " LLintP " bytes, got " LLintP
|
||||||
|
"-" LLintP "/" LLintP "), restarting",
|
||||||
|
back[i].url_adr, back[i].url_fil,
|
||||||
|
(LLint) sz, back[i].r.crange_start,
|
||||||
|
back[i].r.crange_end, back[i].r.crange);
|
||||||
|
url_savename_refname_remove(opt, back[i].url_adr,
|
||||||
|
back[i].url_fil);
|
||||||
|
UNLINK(back[i].url_sav);
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
strcpybuff(back[i].r.msg,
|
||||||
|
"Unusable partial content, restarting");
|
||||||
|
} else { // partial not found
|
||||||
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
back[i].status = STATUS_READY; // terminé (voir plus loin)
|
||||||
back_set_finished(sback, i);
|
back_set_finished(sback, i);
|
||||||
strcpybuff(back[i].r.msg, "Can not find partial file");
|
strcpybuff(back[i].r.msg, "Can not find partial file");
|
||||||
@@ -3828,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*} */
|
/*} */
|
||||||
@@ -3884,6 +4060,29 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
|
|
||||||
if (back[i].status > 0) { // réception/connexion/..
|
if (back[i].status > 0) { // réception/connexion/..
|
||||||
if (back[i].timeout > 0) {
|
if (back[i].timeout > 0) {
|
||||||
|
// a stuck connect with a fallback address: retry the next one well
|
||||||
|
// before the full timeout (dead IPv6 on a dual-stack host, ...)
|
||||||
|
if (back[i].status == STATUS_CONNECTING) {
|
||||||
|
const hts_connect_fallback *const cf =
|
||||||
|
&sback->connect_fallback[i];
|
||||||
|
|
||||||
|
if (back_connect_fallback_due(cf->addr_index, cf->addr_count,
|
||||||
|
(int) (act - cf->connect_start),
|
||||||
|
back[i].timeout)) {
|
||||||
|
if (back_connect_next(opt, sback, i)) {
|
||||||
|
continue; // reconnected to the next candidate
|
||||||
|
}
|
||||||
|
// fallback was due but no socket could be opened
|
||||||
|
// (back_connect_next closed the dead one): stop now rather than
|
||||||
|
// spin on an invalid fd
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||||
|
strcpybuff(back[i].r.msg, "Connect Error");
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
//printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout);
|
//printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout);
|
||||||
if (((int) (act - back[i].timeout_refresh)) >= back[i].timeout) {
|
if (((int) (act - back[i].timeout_refresh)) >= back[i].timeout) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "connection timed out for %s%s", back[i].url_adr,
|
hts_log_print(opt, LOG_DEBUG, "connection timed out for %s%s", back[i].url_adr,
|
||||||
|
|||||||
@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
|
|||||||
STATUSCODE_NON_FATAL = -5,
|
STATUSCODE_NON_FATAL = -5,
|
||||||
STATUSCODE_SSL_HANDSHAKE = -6,
|
STATUSCODE_SSL_HANDSHAKE = -6,
|
||||||
STATUSCODE_TOO_BIG = -7,
|
STATUSCODE_TOO_BIG = -7,
|
||||||
STATUSCODE_TEST_OK = -10
|
STATUSCODE_TEST_OK = -10,
|
||||||
|
STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
|
||||||
} BackStatusCode;
|
} BackStatusCode;
|
||||||
|
|
||||||
/** HTTrack status ('status' member of 'lien_back') **/
|
/** HTTrack status ('status' member of 'lien_back') **/
|
||||||
|
|||||||
@@ -220,6 +220,25 @@ struct cache_back_zip_entry {
|
|||||||
} \
|
} \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||||
|
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||||
|
did. */
|
||||||
|
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||||
|
const char *what, int zErr) {
|
||||||
|
if (!cache->zipWriteFailed) {
|
||||||
|
cache->zipWriteFailed = HTS_TRUE;
|
||||||
|
if (check_fatal_io_errno()) {
|
||||||
|
hts_log_print(opt, LOG_ERROR,
|
||||||
|
"Mirror aborted: disk full or filesystem problems");
|
||||||
|
} else {
|
||||||
|
hts_log_print(opt, LOG_ERROR,
|
||||||
|
"Mirror aborted: cache write failed (%s): %s", what,
|
||||||
|
hts_get_zerror(zErr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||||
|
}
|
||||||
|
|
||||||
/* Ajout d'un fichier en cache */
|
/* Ajout d'un fichier en cache */
|
||||||
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||||
const char *url_adr, const char *url_fil, const char *url_save,
|
const char *url_adr, const char *url_fil, const char *url_save,
|
||||||
@@ -236,6 +255,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
|||||||
const char *url_save_suffix = url_save;
|
const char *url_save_suffix = url_save;
|
||||||
int zErr;
|
int zErr;
|
||||||
|
|
||||||
|
/* already failed and aborting; don't touch the broken stream again */
|
||||||
|
if (cache->zipWriteFailed)
|
||||||
|
return;
|
||||||
|
|
||||||
// robots.txt hack
|
// robots.txt hack
|
||||||
if (url_save == NULL) {
|
if (url_save == NULL) {
|
||||||
dataincache = 0; // testing links
|
dataincache = 0; // testing links
|
||||||
@@ -346,9 +369,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
|||||||
*/
|
*/
|
||||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||||
int zip_zipOpenNewFileInZip_failed = 0;
|
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||||
|
return;
|
||||||
assertf(zip_zipOpenNewFileInZip_failed);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Write data in cache */
|
/* Write data in cache */
|
||||||
@@ -358,9 +380,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
|||||||
if ((zErr =
|
if ((zErr =
|
||||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||||
(int) r->size)) != Z_OK) {
|
(int) r->size)) != Z_OK) {
|
||||||
int zip_zipWriteInFileInZip_failed = 0;
|
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||||
|
return;
|
||||||
assertf(zip_zipWriteInFileInZip_failed);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -381,9 +402,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
|||||||
if ((zErr =
|
if ((zErr =
|
||||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||||
(int) nl)) != Z_OK) {
|
(int) nl)) != Z_OK) {
|
||||||
int zip_zipWriteInFileInZip_failed = 0;
|
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||||
|
zErr);
|
||||||
assertf(zip_zipWriteInFileInZip_failed);
|
fclose(fp);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while(nl > 0);
|
} while(nl > 0);
|
||||||
@@ -397,16 +419,14 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
|||||||
|
|
||||||
/* Close */
|
/* Close */
|
||||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||||
int zip_zipCloseFileInZip_failed = 0;
|
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||||
|
return;
|
||||||
assertf(zip_zipCloseFileInZip_failed);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Flush */
|
/* Flush */
|
||||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||||
int zip_zipFlush_failed = 0;
|
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||||
|
return;
|
||||||
assertf(zip_zipFlush_failed);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#include "htslib.h"
|
#include "htslib.h"
|
||||||
#include "htszlib.h"
|
#include "htszlib.h"
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
@@ -316,6 +317,136 @@ static int disk_fallback_selftest(httrackp *opt) {
|
|||||||
return fail;
|
return fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fault-injection state for the failing-write ZIP backend; handed to the
   zwrite callback through the filefunc 'opaque' pointer. */
typedef struct {
  size_t budget;  /**< bytes that may still be written before writes fail */
  int fail_errno; /**< errno value set by the refused write (ENOSPC, EIO, ...) */
  int writes;     /**< number of zwrite calls so far; a change after the
                       failure reveals re-entry into the stream */
} writefail_inject;
|
||||||
|
|
||||||
|
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||||
|
(the #174/#219 condition). Counts calls so the test can prove a flagged cache
|
||||||
|
never re-enters the stream. */
|
||||||
|
static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||||
|
const void *buf, uLong size) {
|
||||||
|
writefail_inject *inj = (writefail_inject *) opaque;
|
||||||
|
|
||||||
|
inj->writes++;
|
||||||
|
if (inj->budget >= (size_t) size) {
|
||||||
|
inj->budget -= (size_t) size;
|
||||||
|
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||||
|
}
|
||||||
|
errno = inj->fail_errno;
|
||||||
|
return 0; /* short write -> the minizip op returns an error */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Open a ZIP whose writes fail past inj->budget, so cache_add() hits an error.
|
||||||
|
*/
|
||||||
|
static zipFile selftest_open_failing_zip(const char *path,
|
||||||
|
writefail_inject *inj) {
|
||||||
|
zlib_filefunc_def ff;
|
||||||
|
|
||||||
|
fill_fopen_filefunc(&ff); /* real fopen/read/seek/close; ignores opaque */
|
||||||
|
ff.zwrite_file = selftest_failing_zwrite;
|
||||||
|
ff.opaque = inj;
|
||||||
|
return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &ff);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP). */
|
||||||
|
static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||||
|
const char *body, size_t body_len) {
|
||||||
|
htsblk r;
|
||||||
|
char locbuf[4];
|
||||||
|
char *bodycopy = malloct(body_len);
|
||||||
|
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
r.statuscode = 200;
|
||||||
|
r.size = (LLint) body_len;
|
||||||
|
strcpybuff(r.msg, "OK");
|
||||||
|
strcpybuff(r.contenttype, "application/octet-stream");
|
||||||
|
locbuf[0] = '\0';
|
||||||
|
r.location = locbuf;
|
||||||
|
r.is_write = 0;
|
||||||
|
memcpy(bodycopy, body, body_len);
|
||||||
|
r.adr = bodycopy;
|
||||||
|
cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
|
||||||
|
NULL);
|
||||||
|
freet(bodycopy);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||||
|
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||||
|
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||||
|
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||||
|
int fail = 0;
|
||||||
|
char path[HTS_URLMAXSIZE];
|
||||||
|
/* incompressible + big, so deflate flushes (and fails) mid-write, before
|
||||||
|
* close */
|
||||||
|
static const size_t body_len = 256 * 1024;
|
||||||
|
char *body = malloct(body_len);
|
||||||
|
int phase;
|
||||||
|
|
||||||
|
gen_body(body, body_len, 1 /* incompressible */);
|
||||||
|
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||||
|
|
||||||
|
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||||
|
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||||
|
branch). Both must abort the mirror. */
|
||||||
|
for (phase = 0; phase < 2; phase++) {
|
||||||
|
cache_back cache;
|
||||||
|
writefail_inject inj;
|
||||||
|
int writes_after_fail;
|
||||||
|
|
||||||
|
inj.budget = (phase == 0) ? 4096 : 0;
|
||||||
|
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||||
|
inj.writes = 0;
|
||||||
|
memset(&cache, 0, sizeof(cache));
|
||||||
|
cache.type = 1;
|
||||||
|
cache.log = stderr;
|
||||||
|
cache.errlog = stderr;
|
||||||
|
cache.hashtable = coucal_new(0);
|
||||||
|
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||||
|
if (cache.zipOutput == NULL) {
|
||||||
|
fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
|
||||||
|
fail++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||||
|
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||||
|
if (!cache.zipWriteFailed) {
|
||||||
|
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||||
|
phase);
|
||||||
|
fail++;
|
||||||
|
}
|
||||||
|
if (opt->state.exit_xh != -1) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
|
||||||
|
phase, opt->state.exit_xh);
|
||||||
|
fail++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* a flagged cache must no-op a sibling write: no further backend write */
|
||||||
|
writes_after_fail = inj.writes;
|
||||||
|
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||||
|
if (inj.writes != writes_after_fail) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"cache-writefail: phase %d: sibling write re-entered the broken "
|
||||||
|
"stream (%d extra backend writes)\n",
|
||||||
|
phase, inj.writes - writes_after_fail);
|
||||||
|
fail++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cache.zipOutput != NULL) {
|
||||||
|
zipClose(cache.zipOutput,
|
||||||
|
NULL); /* best-effort; may fail on the backend */
|
||||||
|
cache.zipOutput = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
freet(body);
|
||||||
|
return fail;
|
||||||
|
}
|
||||||
|
|
||||||
int cache_selftests(httrackp *opt, const char *dir) {
|
int cache_selftests(httrackp *opt, const char *dir) {
|
||||||
int failures = 0;
|
int failures = 0;
|
||||||
cache_back cache;
|
cache_back cache;
|
||||||
|
|||||||
@@ -52,6 +52,10 @@ int cache_selftests(httrackp *opt, const char *dir);
|
|||||||
committed file, never by the test). Returns the failed-check count. */
|
committed file, never by the test). Returns the failed-check count. */
|
||||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||||
|
|
||||||
|
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||||
|
crashing. Returns the failed-check count. */
|
||||||
|
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -736,26 +736,39 @@ int httpmirror(char *url1, httrackp * opt) {
|
|||||||
/* OPTIMIZED for fast load */
|
/* OPTIMIZED for fast load */
|
||||||
if (StringNotEmpty(opt->filelist)) {
|
if (StringNotEmpty(opt->filelist)) {
|
||||||
char *filelist_buff = NULL;
|
char *filelist_buff = NULL;
|
||||||
const size_t filelist_sz = off_t_to_size_t(fsize(StringBuff(opt->filelist)));
|
size_t filelist_sz = 0;
|
||||||
|
const char *filelist_err = NULL; /* failure reason, NULL on success */
|
||||||
|
const off_t fs = fsize(StringBuff(opt->filelist));
|
||||||
|
|
||||||
if (filelist_sz != (size_t) -1) {
|
if (fs < 0) {
|
||||||
|
/* fsize() hides the cause; redo stat() for a precise errno (#49) */
|
||||||
|
struct stat st;
|
||||||
|
filelist_err = stat(StringBuff(opt->filelist), &st) != 0
|
||||||
|
? strerror(errno)
|
||||||
|
: "not a regular file";
|
||||||
|
} else if ((filelist_sz = off_t_to_size_t(fs)) == (size_t) -1) {
|
||||||
|
filelist_err = "file too large";
|
||||||
|
filelist_sz = 0;
|
||||||
|
} else {
|
||||||
FILE *fp = fopen(StringBuff(opt->filelist), "rb");
|
FILE *fp = fopen(StringBuff(opt->filelist), "rb");
|
||||||
|
|
||||||
if (fp) {
|
if (fp == NULL) {
|
||||||
|
filelist_err = strerror(errno);
|
||||||
|
} else {
|
||||||
filelist_buff = malloct(filelist_sz + 1);
|
filelist_buff = malloct(filelist_sz + 1);
|
||||||
if (filelist_buff) {
|
if (filelist_buff == NULL) {
|
||||||
if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
|
filelist_err = "out of memory";
|
||||||
freet(filelist_buff);
|
} else if (fread(filelist_buff, 1, filelist_sz, fp) != filelist_sz) {
|
||||||
filelist_buff = NULL;
|
freet(filelist_buff);
|
||||||
} else {
|
filelist_err = "read error";
|
||||||
*(filelist_buff + filelist_sz) = '\0';
|
} else {
|
||||||
}
|
filelist_buff[filelist_sz] = '\0';
|
||||||
}
|
}
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (filelist_buff) {
|
if (filelist_buff != NULL) {
|
||||||
int filelist_ptr = 0;
|
int filelist_ptr = 0;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
||||||
@@ -780,8 +793,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
|||||||
// Free buffer
|
// Free buffer
|
||||||
freet(filelist_buff);
|
freet(filelist_buff);
|
||||||
} else {
|
} else {
|
||||||
hts_log_print(opt, LOG_ERROR, "Could not include URL list: %s",
|
hts_log_print(opt, LOG_ERROR, "Could not include URL list \"%s\": %s",
|
||||||
StringBuff(opt->filelist));
|
StringBuff(opt->filelist), filelist_err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3726,6 +3739,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
|||||||
if (StringNotEmpty(from->user_agent))
|
if (StringNotEmpty(from->user_agent))
|
||||||
StringCopyS(to->user_agent, from->user_agent);
|
StringCopyS(to->user_agent, from->user_agent);
|
||||||
|
|
||||||
|
if (StringNotEmpty(from->strip_query))
|
||||||
|
StringCopyS(to->strip_query, from->strip_query);
|
||||||
|
|
||||||
if (from->retry > -1)
|
if (from->retry > -1)
|
||||||
to->retry = from->retry;
|
to->retry = from->retry;
|
||||||
|
|
||||||
|
|||||||
@@ -152,6 +152,15 @@ struct lien_adrfilsave {
|
|||||||
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Per-slot connect-fallback bookkeeping (parallel to struct_back.lnk).
|
||||||
|
Tracks which resolved address the slot is currently connecting to so a
|
||||||
|
stuck connect can be retried against the next one. */
|
||||||
|
typedef struct hts_connect_fallback {
|
||||||
|
int addr_index; /**< candidate being connected (0-based) */
|
||||||
|
int addr_count; /**< resolved addresses; -1 = not yet probed */
|
||||||
|
TStamp connect_start; /**< when the current candidate's connect began */
|
||||||
|
} hts_connect_fallback;
|
||||||
|
|
||||||
/** The download-slot ring: the set of concurrent transfers in flight.
|
/** The download-slot ring: the set of concurrent transfers in flight.
|
||||||
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
||||||
read it but do not resize or free it. */
|
read it but do not resize or free it. */
|
||||||
@@ -168,6 +177,7 @@ struct struct_back {
|
|||||||
int count; /**< number of usable slots (back_max) */
|
int count; /**< number of usable slots (back_max) */
|
||||||
coucal ready; /**< index of slots whose transfer completed */
|
coucal ready; /**< index of slots whose transfer completed */
|
||||||
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
||||||
|
hts_connect_fallback *connect_fallback; /**< per-slot, count+1 entries */
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
||||||
@@ -204,6 +214,8 @@ struct cache_back {
|
|||||||
cache_back_zip_entry *zipEntries;
|
cache_back_zip_entry *zipEntries;
|
||||||
int zipEntriesOffs;
|
int zipEntriesOffs;
|
||||||
int zipEntriesCapa;
|
int zipEntriesCapa;
|
||||||
|
hts_boolean
|
||||||
|
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||||
@@ -222,8 +234,12 @@ struct hash_struct {
|
|||||||
coucal adrfil;
|
coucal adrfil;
|
||||||
/* former address+path -> link index (renamed/moved entries) */
|
/* former address+path -> link index (renamed/moved entries) */
|
||||||
coucal former_adrfil;
|
coucal former_adrfil;
|
||||||
/* scratch buffers reused across lookups (not reentrant) */
|
/* effective urlhack sub-flags: www.==host / // collapse / query-arg sort */
|
||||||
int normalized;
|
hts_boolean norm_host;
|
||||||
|
hts_boolean norm_slash;
|
||||||
|
hts_boolean norm_query;
|
||||||
|
/* query-strip keys (not owned); set from opt->strip_query at hash_init */
|
||||||
|
const char *strip_query;
|
||||||
char normfil[HTS_URLMAXSIZE * 2];
|
char normfil[HTS_URLMAXSIZE * 2];
|
||||||
char normfil2[HTS_URLMAXSIZE * 2];
|
char normfil2[HTS_URLMAXSIZE * 2];
|
||||||
char catbuff[CATBUFF_SIZE];
|
char catbuff[CATBUFF_SIZE];
|
||||||
@@ -352,6 +368,22 @@ int fspc(httrackp * opt, FILE * fp, const char *type);
|
|||||||
|
|
||||||
char *next_token(char *p, int flag);
|
char *next_token(char *p, int flag);
|
||||||
|
|
||||||
|
/* Like fil_normalized(), but first drops query keys in STRIP (comma-separated,
|
||||||
|
"*" = all); STRIP NULL/empty behaves exactly like fil_normalized(). */
|
||||||
|
char *fil_normalized_filtered(const char *source, char *dest,
|
||||||
|
const char *strip);
|
||||||
|
|
||||||
|
/* As fil_normalized_filtered(), but DO_SLASH/DO_QUERY gate the // collapse and
|
||||||
|
the query-argument sort independently (the urlhack sub-flags). */
|
||||||
|
char *fil_normalized_filtered_ex(const char *source, char *dest,
|
||||||
|
const char *strip, int do_slash, int do_query);
|
||||||
|
|
||||||
|
/* For URL ADR/FIL, return (in DEST) the comma keylist to strip from the
|
||||||
|
'\n'-separated "[pattern=]keys" RULES (patterns matched on host/path via
|
||||||
|
strjoker, last wins); NULL if none match. Feeds fil_normalized_filtered(). */
|
||||||
|
const char *hts_query_strip_keys(const char *rules, const char *adr,
|
||||||
|
const char *fil, char *dest, size_t destsize);
|
||||||
|
|
||||||
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
|
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
|
||||||
owns it and must release it with freet(). Return NULL on missing/unreadable
|
owns it and must release it with freet(). Return NULL on missing/unreadable
|
||||||
file (readfile_or substitutes defaultdata instead). The byte content is NOT
|
file (readfile_or substitutes defaultdata instead). The byte content is NOT
|
||||||
@@ -372,6 +404,13 @@ void check_rate(TStamp stat_timestart, int maxrate);
|
|||||||
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
||||||
Not thread-safe; call from the single crawl loop. */
|
Not thread-safe; call from the single crawl loop. */
|
||||||
|
|
||||||
|
/* True if a connecting slot should give up on the current address and try the
|
||||||
|
next one: a fallback address remains (addr_index+1 < addr_count) and the
|
||||||
|
candidate has been connecting for at least its deadline, min(timeout, an
|
||||||
|
internal cap). elapsed/timeout in seconds. Exposed for the -#D self-test. */
|
||||||
|
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
|
||||||
|
int timeout);
|
||||||
|
|
||||||
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
||||||
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
||||||
room for naming tests and stops at 0 when the stack is nearly full. */
|
room for naming tests and stops at 0 when the stack is nearly full. */
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -62,7 +62,7 @@ typedef struct mock_host {
|
|||||||
const char *name;
|
const char *name;
|
||||||
int gai_err; /* non-zero: getaddrinfo returns this */
|
int gai_err; /* non-zero: getaddrinfo returns this */
|
||||||
int naddr;
|
int naddr;
|
||||||
mock_addr addr[3];
|
mock_addr addr[6];
|
||||||
int calls; /* times the backend resolved this host */
|
int calls; /* times the backend resolved this host */
|
||||||
} mock_host;
|
} mock_host;
|
||||||
|
|
||||||
@@ -83,6 +83,17 @@ static mock_host mock_hosts[] = {
|
|||||||
{{AF_INET, {9, 10, 11, 12}},
|
{{AF_INET, {9, 10, 11, 12}},
|
||||||
{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 3}}},
|
{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 3}}},
|
||||||
0},
|
0},
|
||||||
|
/* more addresses than HTS_MAXADDRNUM: the list must clamp to the cap. */
|
||||||
|
{"many.test",
|
||||||
|
0,
|
||||||
|
6,
|
||||||
|
{{AF_INET, {10, 0, 0, 1}},
|
||||||
|
{AF_INET, {10, 0, 0, 2}},
|
||||||
|
{AF_INET, {10, 0, 0, 3}},
|
||||||
|
{AF_INET, {10, 0, 0, 4}},
|
||||||
|
{AF_INET, {10, 0, 0, 5}},
|
||||||
|
{AF_INET, {10, 0, 0, 6}}},
|
||||||
|
0},
|
||||||
{"nodns.test", EAI_NONAME, 0, {{0}}, 0},
|
{"nodns.test", EAI_NONAME, 0, {{0}}, 0},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -197,9 +208,9 @@ int dns_selftests(httrackp *opt) {
|
|||||||
CHECK(resolve_family_nocache("v4only.test") == AF_INET);
|
CHECK(resolve_family_nocache("v4only.test") == AF_INET);
|
||||||
CHECK(resolve_family_nocache("v6only.test") == AF_INET6);
|
CHECK(resolve_family_nocache("v6only.test") == AF_INET6);
|
||||||
|
|
||||||
/* Dual-stack: the current resolver keeps only the *first* address. Both
|
/* Dual-stack: the single-address API returns the *first* resolved address.
|
||||||
orderings pin that (not a family preference); PR2 (multi-address) widens
|
Both orderings pin selection by position, not a family preference. The
|
||||||
it. */
|
multi-address API (resolve_all, below) exposes the whole list. */
|
||||||
CHECK(resolve_family_nocache("dual.test") == AF_INET6); /* v6 listed first */
|
CHECK(resolve_family_nocache("dual.test") == AF_INET6); /* v6 listed first */
|
||||||
CHECK(resolve_family_nocache("dual4.test") == AF_INET); /* v4 listed first */
|
CHECK(resolve_family_nocache("dual4.test") == AF_INET); /* v4 listed first */
|
||||||
|
|
||||||
@@ -240,6 +251,100 @@ int dns_selftests(httrackp *opt) {
|
|||||||
CHECK(mock_find("nodns.test")->calls == 1); /* resolved once, then cached */
|
CHECK(mock_find("nodns.test")->calls == 1); /* resolved once, then cached */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Multi-address resolution: count and order are the connect-fallback
|
||||||
|
contract. A dead first address is retried against the next, so both must be
|
||||||
|
exact. */
|
||||||
|
mock_reset_calls();
|
||||||
|
{
|
||||||
|
SOCaddr addrs[HTS_MAXADDRNUM];
|
||||||
|
char ip[64];
|
||||||
|
const char *err = NULL;
|
||||||
|
|
||||||
|
/* dual-stack, in resolver order: [0]=v6, [1]=v4 */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "dual.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
2);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET6);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[1]) == AF_INET);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[1]);
|
||||||
|
CHECK(strcmp(ip, "5.6.7.8") == 0);
|
||||||
|
CHECK(mock_find("dual.test")->calls ==
|
||||||
|
1); /* one backend hit for the list */
|
||||||
|
|
||||||
|
/* single-address host: count 1 */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "v4only.test", addrs, HTS_MAXADDRNUM,
|
||||||
|
&err) == 1);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
|
||||||
|
CHECK(strcmp(ip, "1.2.3.4") == 0);
|
||||||
|
|
||||||
|
/* does-not-resolve: count 0 (negative), no addresses */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "nodns.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
0);
|
||||||
|
|
||||||
|
/* more than the cap: the kept list is clamped to HTS_MAXADDRNUM, keeping
|
||||||
|
the FIRST addresses in resolver order (not some other window) */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "many.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
HTS_MAXADDRNUM);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
|
||||||
|
CHECK(strcmp(ip, "10.0.0.1") == 0);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[HTS_MAXADDRNUM - 1]);
|
||||||
|
CHECK(strcmp(ip, "10.0.0.4") == 0);
|
||||||
|
|
||||||
|
/* family filter still applies through the list path */
|
||||||
|
IPV6_resolver = 1;
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "dual4.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
1);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET);
|
||||||
|
IPV6_resolver = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* newhttp_addr() must connect to the addr_index-th address, not always the
|
||||||
|
first: this is what back_connect_next relies on to reach the fallback. */
|
||||||
|
{
|
||||||
|
htsblk r;
|
||||||
|
int count = -1;
|
||||||
|
T_SOC s;
|
||||||
|
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 0, &count);
|
||||||
|
CHECK(count == 2);
|
||||||
|
CHECK(SOCaddr_sinfamily(r.address) == AF_INET6); /* index 0 = v6 */
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
count = -1;
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 1, &count);
|
||||||
|
CHECK(count == 2);
|
||||||
|
CHECK(SOCaddr_sinfamily(r.address) == AF_INET); /* index 1 = v4 */
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
|
||||||
|
/* out-of-range index: no address selected (address stays unset) */
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 2, NULL);
|
||||||
|
CHECK(s == INVALID_SOCKET);
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Connect-fallback decision (consumer of the multi-address list): when a
|
||||||
|
stuck connect should abandon the current address for the next one. */
|
||||||
|
{
|
||||||
|
/* no fallback for the last/only candidate, whatever the elapsed time */
|
||||||
|
CHECK(back_connect_fallback_due(0, 1, 9999, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(1, 2, 9999, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(3, 4, 9999, 120) == 0);
|
||||||
|
/* fallback available: wait the per-candidate deadline (cap 10s here) */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 9, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 10, 120) == 1);
|
||||||
|
CHECK(back_connect_fallback_due(2, 4, 10, 120) == 1);
|
||||||
|
/* a shorter slot timeout shortens the deadline (min(timeout, cap)) */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 4, 5) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 5, 5) == 1);
|
||||||
|
/* no timeout management: never force a fallback */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 9999, 0) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
hts_dns_set_resolver_backend(NULL);
|
hts_dns_set_resolver_backend(NULL);
|
||||||
return failures;
|
return failures;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -76,7 +76,8 @@ int fa_strjoker(int type, char **filters, int nfil, const char *nom, LLint * siz
|
|||||||
}
|
}
|
||||||
if (size)
|
if (size)
|
||||||
sz = *size;
|
sz = *size;
|
||||||
if (strjoker(nom, filters[i] + filteroffs, &sz, size_flag)) { // reconnu
|
/* size unknown (scan time): no size pointer => size tests stay neutral */
|
||||||
|
if (strjoker(nom, filters[i] + filteroffs, size ? &sz : NULL, size_flag)) {
|
||||||
if (size)
|
if (size)
|
||||||
if (sz != *size)
|
if (sz != *size)
|
||||||
sizelimit = sz;
|
sizelimit = sz;
|
||||||
|
|||||||
@@ -408,6 +408,10 @@ typedef int T_SOC;
|
|||||||
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
||||||
#define HTS_MAXADDRLEN 64
|
#define HTS_MAXADDRLEN 64
|
||||||
|
|
||||||
|
/* Max resolved addresses kept per host for connect fallback (dead IPv6 etc.).
|
||||||
|
*/
|
||||||
|
#define HTS_MAXADDRNUM 4
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#else
|
#else
|
||||||
#define __cdecl
|
#define __cdecl
|
||||||
|
|||||||
@@ -106,10 +106,10 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
|
|||||||
const lien_url*const lien = (const lien_url*) value;
|
const lien_url*const lien = (const lien_url*) value;
|
||||||
const char *const adr = !former ? lien->adr : lien->former_adr;
|
const char *const adr = !former ? lien->adr : lien->former_adr;
|
||||||
const char *const fil = !former ? lien->fil : lien->former_fil;
|
const char *const fil = !former ? lien->fil : lien->former_fil;
|
||||||
const char *const adr_norm = adr != NULL ?
|
const char *const adr_norm =
|
||||||
( hash->normalized ? jump_normalized_const(adr)
|
adr != NULL ? (hash->norm_host ? jump_normalized_const(adr)
|
||||||
: jump_identification_const(adr) )
|
: jump_identification_const(adr))
|
||||||
: NULL;
|
: NULL;
|
||||||
|
|
||||||
// copy address
|
// copy address
|
||||||
assertf(adr_norm != NULL);
|
assertf(adr_norm != NULL);
|
||||||
@@ -117,10 +117,18 @@ static coucal_hashkeys key_adrfil_hashes_generic(void *arg,
|
|||||||
|
|
||||||
// copy link
|
// copy link
|
||||||
assertf(fil != NULL);
|
assertf(fil != NULL);
|
||||||
if (hash->normalized) {
|
{
|
||||||
fil_normalized(fil, &hash->normfil[strlen(hash->normfil)]);
|
/* resolve the per-URL strip keys; strip applies even when urlhack is off */
|
||||||
} else {
|
char BIGSTK keybuf[HTS_URLMAXSIZE];
|
||||||
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
|
const char *const keys = hts_query_strip_keys(hash->strip_query, adr, fil,
|
||||||
|
keybuf, sizeof(keybuf));
|
||||||
|
|
||||||
|
if (hash->norm_slash || hash->norm_query || keys != NULL) {
|
||||||
|
fil_normalized_filtered_ex(fil, &hash->normfil[strlen(hash->normfil)],
|
||||||
|
keys, hash->norm_slash, hash->norm_query);
|
||||||
|
} else {
|
||||||
|
strcpy(&hash->normfil[strlen(hash->normfil)], fil);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// hash
|
// hash
|
||||||
@@ -132,8 +140,7 @@ static int key_adrfil_equals_generic(void *arg,
|
|||||||
coucal_key_const a_,
|
coucal_key_const a_,
|
||||||
coucal_key_const b_,
|
coucal_key_const b_,
|
||||||
const int former) {
|
const int former) {
|
||||||
hash_struct *const hash = (hash_struct*) arg;
|
hash_struct *const hash = (hash_struct *) arg;
|
||||||
const int normalized = hash->normalized;
|
|
||||||
const lien_url*const a = (const lien_url*) a_;
|
const lien_url*const a = (const lien_url*) a_;
|
||||||
const lien_url*const b = (const lien_url*) b_;
|
const lien_url*const b = (const lien_url*) b_;
|
||||||
const char *const a_adr = !former ? a->adr : a->former_adr;
|
const char *const a_adr = !former ? a->adr : a->former_adr;
|
||||||
@@ -150,10 +157,10 @@ static int key_adrfil_equals_generic(void *arg,
|
|||||||
assertf(b_fil != NULL);
|
assertf(b_fil != NULL);
|
||||||
|
|
||||||
// skip scheme and authentication to the domain (possibly without www.)
|
// skip scheme and authentication to the domain (possibly without www.)
|
||||||
ja = normalized
|
ja = hash->norm_host ? jump_normalized_const(a_adr)
|
||||||
? jump_normalized_const(a_adr) : jump_identification_const(a_adr);
|
: jump_identification_const(a_adr);
|
||||||
jb = normalized
|
jb = hash->norm_host ? jump_normalized_const(b_adr)
|
||||||
? jump_normalized_const(b_adr) : jump_identification_const(b_adr);
|
: jump_identification_const(b_adr);
|
||||||
assertf(ja != NULL);
|
assertf(ja != NULL);
|
||||||
assertf(jb != NULL);
|
assertf(jb != NULL);
|
||||||
if (strcasecmp(ja, jb) != 0) {
|
if (strcasecmp(ja, jb) != 0) {
|
||||||
@@ -161,12 +168,23 @@ static int key_adrfil_equals_generic(void *arg,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// now compare pathes
|
// now compare pathes
|
||||||
if (normalized) {
|
{
|
||||||
fil_normalized(a_fil, hash->normfil);
|
char BIGSTK ka[HTS_URLMAXSIZE], kb[HTS_URLMAXSIZE];
|
||||||
fil_normalized(b_fil, hash->normfil2);
|
const char *const keysa =
|
||||||
return strcmp(hash->normfil, hash->normfil2) == 0;
|
hts_query_strip_keys(hash->strip_query, a_adr, a_fil, ka, sizeof(ka));
|
||||||
} else {
|
const char *const keysb =
|
||||||
return strcmp(a_fil, b_fil) == 0;
|
hts_query_strip_keys(hash->strip_query, b_adr, b_fil, kb, sizeof(kb));
|
||||||
|
|
||||||
|
if (hash->norm_slash || hash->norm_query || keysa != NULL ||
|
||||||
|
keysb != NULL) {
|
||||||
|
fil_normalized_filtered_ex(a_fil, hash->normfil, keysa, hash->norm_slash,
|
||||||
|
hash->norm_query);
|
||||||
|
fil_normalized_filtered_ex(b_fil, hash->normfil2, keysb, hash->norm_slash,
|
||||||
|
hash->norm_query);
|
||||||
|
return strcmp(hash->normfil, hash->normfil2) == 0;
|
||||||
|
} else {
|
||||||
|
return strcmp(a_fil, b_fil) == 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,11 +240,17 @@ static int key_former_adrfil_equals(void *arg,
|
|||||||
return key_adrfil_equals_generic(arg, a, b, 1);
|
return key_adrfil_equals_generic(arg, a, b, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void hash_init(httrackp *opt, hash_struct * hash, int normalized) {
|
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized) {
|
||||||
hash->sav = coucal_new(0);
|
hash->sav = coucal_new(0);
|
||||||
hash->adrfil = coucal_new(0);
|
hash->adrfil = coucal_new(0);
|
||||||
hash->former_adrfil = coucal_new(0);
|
hash->former_adrfil = coucal_new(0);
|
||||||
hash->normalized = normalized;
|
/* urlhack is the umbrella; per-feature negatives opt out of each part */
|
||||||
|
hash->norm_host = normalized && !opt->no_www_dedup;
|
||||||
|
hash->norm_slash = normalized && !opt->no_slash_dedup;
|
||||||
|
hash->norm_query = normalized && !opt->no_query_dedup;
|
||||||
|
/* snapshot the query-strip list (not owned; valid for the hash lifetime) */
|
||||||
|
hash->strip_query =
|
||||||
|
StringNotEmpty(opt->strip_query) ? StringBuff(opt->strip_query) : NULL;
|
||||||
|
|
||||||
hts_set_hash_handler(hash->sav, opt);
|
hts_set_hash_handler(hash->sav, opt);
|
||||||
hts_set_hash_handler(hash->adrfil, opt);
|
hts_set_hash_handler(hash->adrfil, opt);
|
||||||
@@ -282,6 +306,26 @@ void hash_free(hash_struct *hash) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Test helper: do the two URLs dedupe to the same key under opt's urlhack
|
||||||
|
flags? Exercises the live hash compare (norm_host/slash/query resolution). */
|
||||||
|
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
|
||||||
|
const char *adrb, const char *filb) {
|
||||||
|
hash_struct hash;
|
||||||
|
lien_url la, lb;
|
||||||
|
hts_boolean eq;
|
||||||
|
|
||||||
|
memset(&la, 0, sizeof(la));
|
||||||
|
memset(&lb, 0, sizeof(lb));
|
||||||
|
la.adr = key_duphandler(NULL, adra);
|
||||||
|
la.fil = key_duphandler(NULL, fila);
|
||||||
|
lb.adr = key_duphandler(NULL, adrb);
|
||||||
|
lb.fil = key_duphandler(NULL, filb);
|
||||||
|
hash_init(opt, &hash, opt->urlhack);
|
||||||
|
eq = key_adrfil_equals(&hash, &la, &lb);
|
||||||
|
hash_free(&hash);
|
||||||
|
return eq;
|
||||||
|
}
|
||||||
|
|
||||||
// retour: position ou -1 si non trouvé
|
// retour: position ou -1 si non trouvé
|
||||||
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
||||||
hash_struct_type type) {
|
hash_struct_type type) {
|
||||||
|
|||||||
@@ -51,8 +51,12 @@ typedef enum hash_struct_type {
|
|||||||
} hash_struct_type;
|
} hash_struct_type;
|
||||||
|
|
||||||
// tables de hachage
|
// tables de hachage
|
||||||
void hash_init(httrackp *opt, hash_struct *hash, int normalized);
|
void hash_init(httrackp *opt, hash_struct *hash, hts_boolean normalized);
|
||||||
void hash_free(hash_struct *hash);
|
void hash_free(hash_struct *hash);
|
||||||
|
/* Test helper: HTS_TRUE if the two URLs dedupe together under opt's urlhack
|
||||||
|
flags. */
|
||||||
|
hts_boolean hash_url_equals(httrackp *opt, const char *adra, const char *fila,
|
||||||
|
const char *adrb, const char *filb);
|
||||||
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
|
||||||
hash_struct_type type);
|
hash_struct_type type);
|
||||||
void hash_write(hash_struct * hash, size_t lpos);
|
void hash_write(hash_struct * hash, size_t lpos);
|
||||||
|
|||||||
@@ -563,6 +563,7 @@ void help(const char *app, int more) {
|
|||||||
(" %x do not include any password for external password protected websites (%x0 include)");
|
(" %x do not include any password for external password protected websites (%x0 include)");
|
||||||
infomsg
|
infomsg
|
||||||
(" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)");
|
(" %q *include query string for local files (useless, for information purpose only) (%q0 don't include)");
|
||||||
|
infomsg(" %g strip query keys for dedup ([host/pattern=]key1,key2,...)");
|
||||||
infomsg
|
infomsg
|
||||||
(" o *generate output html file in case of error (404..) (o0 don't generate)");
|
(" o *generate output html file in case of error (404..) (o0 don't generate)");
|
||||||
infomsg(" X *purge old files after update (X0 keep delete)");
|
infomsg(" X *purge old files after update (X0 keep delete)");
|
||||||
@@ -587,6 +588,9 @@ void help(const char *app, int more) {
|
|||||||
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
|
(" %s update hacks: various hacks to limit re-transfers when updating (identical size, bogus response..)");
|
||||||
infomsg
|
infomsg
|
||||||
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
|
(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)");
|
||||||
|
infomsg(" opt out of one url-hack part: --keep-www-prefix "
|
||||||
|
"(www.foo.com<>foo.com), --keep-double-slashes (//), "
|
||||||
|
"--keep-query-order (?b&a)");
|
||||||
infomsg
|
infomsg
|
||||||
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
|
(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)");
|
||||||
infomsg(" shortcut: '--assume standard' is equivalent to -%A "
|
infomsg(" shortcut: '--assume standard' is equivalent to -%A "
|
||||||
@@ -646,9 +650,7 @@ void help(const char *app, int more) {
|
|||||||
infomsg("");
|
infomsg("");
|
||||||
infomsg("Guru options: (do NOT use if possible)");
|
infomsg("Guru options: (do NOT use if possible)");
|
||||||
infomsg(" #X *use optimized engine (limited memory boundary checks)");
|
infomsg(" #X *use optimized engine (limited memory boundary checks)");
|
||||||
infomsg(" #0 filter test (-#0 '*.gif' 'www.bar.com/foo.gif')");
|
infomsg(" #test list engine self-tests (run one with -#test=NAME [args])");
|
||||||
infomsg(" #1 simplify test (-#1 ./foo/bar/../foobar)");
|
|
||||||
infomsg(" #2 type test (-#2 /foo/bar.php)");
|
|
||||||
infomsg(" #C cache list (-#C '*.com/spider*.gif'");
|
infomsg(" #C cache list (-#C '*.com/spider*.gif'");
|
||||||
infomsg(" #R cache repair (damaged cache)");
|
infomsg(" #R cache repair (damaged cache)");
|
||||||
infomsg(" #d debug parser");
|
infomsg(" #d debug parser");
|
||||||
|
|||||||
651
src/htslib.c
651
src/htslib.c
@@ -2297,14 +2297,27 @@ htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc) {
|
|||||||
// peut ouvrir avec des connect() non bloquants: waitconnect=0/1
|
// peut ouvrir avec des connect() non bloquants: waitconnect=0/1
|
||||||
T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
||||||
int waitconnect) {
|
int waitconnect) {
|
||||||
|
return newhttp_addr(opt, _iadr, retour, port, waitconnect, 0, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
T_SOC newhttp_addr(httrackp *opt, const char *_iadr, htsblk *retour, int port,
|
||||||
|
int waitconnect, int addr_index, int *addr_count) {
|
||||||
T_SOC soc; // descipteur de la socket
|
T_SOC soc; // descipteur de la socket
|
||||||
|
|
||||||
|
if (addr_count != NULL) {
|
||||||
|
*addr_count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (strcmp(_iadr, "file://") != 0) { /* non fichier */
|
if (strcmp(_iadr, "file://") != 0) { /* non fichier */
|
||||||
SOCaddr server;
|
SOCaddr server;
|
||||||
|
SOCaddr addrs[HTS_MAXADDRNUM];
|
||||||
|
int naddr;
|
||||||
const char *error = "unknown error";
|
const char *error = "unknown error";
|
||||||
|
|
||||||
// tester un éventuel id:pass et virer id:pass@ si détecté
|
// tester un éventuel id:pass et virer id:pass@ si détecté
|
||||||
const char *const iadr = jump_identification_const(_iadr);
|
const char *const iadr = jump_identification_const(_iadr);
|
||||||
|
const char *resolve_host = iadr;
|
||||||
|
char BIGSTK iadr2[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
SOCaddr_clear(server);
|
SOCaddr_clear(server);
|
||||||
|
|
||||||
@@ -2326,7 +2339,6 @@ T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (a != NULL) {
|
if (a != NULL) {
|
||||||
char BIGSTK iadr2[HTS_URLMAXSIZE * 2];
|
|
||||||
int i = -1;
|
int i = -1;
|
||||||
|
|
||||||
iadr2[0] = '\0';
|
iadr2[0] = '\0';
|
||||||
@@ -2337,18 +2349,19 @@ T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
|||||||
|
|
||||||
// adresse véritable (sans :xx)
|
// adresse véritable (sans :xx)
|
||||||
strncatbuff(iadr2, iadr, (int) (a - iadr));
|
strncatbuff(iadr2, iadr, (int) (a - iadr));
|
||||||
|
resolve_host = iadr2;
|
||||||
// adresse sans le :xx
|
|
||||||
hts_dns_resolve2(opt, iadr2, &server, &error);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
// adresse normale (port par défaut par la suite)
|
|
||||||
hts_dns_resolve2(opt, iadr, &server, &error);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} else { // port défini
|
// resolve the full address list and pick the requested candidate; the
|
||||||
hts_dns_resolve2(opt, iadr, &server, &error);
|
// scheduler retries the next index when a connect fails (dead IPv6 etc.)
|
||||||
|
naddr =
|
||||||
|
hts_dns_resolve_all(opt, resolve_host, addrs, HTS_MAXADDRNUM, &error);
|
||||||
|
if (addr_count != NULL) {
|
||||||
|
*addr_count = naddr;
|
||||||
|
}
|
||||||
|
if (addr_index >= 0 && addr_index < naddr) {
|
||||||
|
SOCaddr_copy_SOCaddr(server, addrs[addr_index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!SOCaddr_is_valid(server)) {
|
if (!SOCaddr_is_valid(server)) {
|
||||||
@@ -3597,7 +3610,10 @@ static int sortNormFnc(const void *a_, const void *b_) {
|
|||||||
return strcmp(*a + 1, *b + 1);
|
return strcmp(*a + 1, *b + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
/* Path normalizer core: optionally collapse redundant '//' (DO_SLASH) and/or
|
||||||
|
sort query arguments (DO_QUERY) so equivalent URLs dedupe. */
|
||||||
|
static char *fil_normalized_ex(const char *source, char *dest, int do_slash,
|
||||||
|
int do_query) {
|
||||||
char lastc = 0;
|
char lastc = 0;
|
||||||
int gotquery = 0;
|
int gotquery = 0;
|
||||||
int ampargs = 0;
|
int ampargs = 0;
|
||||||
@@ -3607,8 +3623,8 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
|||||||
for(i = j = 0; source[i] != '\0'; i++) {
|
for(i = j = 0; source[i] != '\0'; i++) {
|
||||||
if (!gotquery && source[i] == '?')
|
if (!gotquery && source[i] == '?')
|
||||||
gotquery = ampargs = 1;
|
gotquery = ampargs = 1;
|
||||||
if ((!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar
|
if (do_slash && !gotquery && lastc == '/' && source[i] == '/') {
|
||||||
) {
|
// foo//bar -> foo/bar
|
||||||
} else {
|
} else {
|
||||||
if (gotquery && source[i] == '&') {
|
if (gotquery && source[i] == '&') {
|
||||||
ampargs++;
|
ampargs++;
|
||||||
@@ -3620,7 +3636,7 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
|||||||
dest[j++] = '\0';
|
dest[j++] = '\0';
|
||||||
|
|
||||||
/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
|
/* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */
|
||||||
if (ampargs > 1) {
|
if (do_query && ampargs > 1) {
|
||||||
char **amps = malloct(ampargs * sizeof(char *));
|
char **amps = malloct(ampargs * sizeof(char *));
|
||||||
char *copyBuff = NULL;
|
char *copyBuff = NULL;
|
||||||
size_t qLen = 0;
|
size_t qLen = 0;
|
||||||
@@ -3668,6 +3684,153 @@ HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
|||||||
return dest;
|
return dest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
HTSEXT_API char *fil_normalized(const char *source, char *dest) {
|
||||||
|
return fil_normalized_ex(source, dest, 1, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Is query key ARG[0..keylen) in the comma-separated STRIP list? "*" = all;
|
||||||
|
case-sensitive, space-trimmed tokens. */
|
||||||
|
static int hts_query_key_stripped(const char *arg, size_t keylen,
|
||||||
|
const char *strip) {
|
||||||
|
const char *p = strip;
|
||||||
|
|
||||||
|
while (*p != '\0') {
|
||||||
|
const char *start = p;
|
||||||
|
size_t toklen;
|
||||||
|
|
||||||
|
while (*p != '\0' && *p != ',')
|
||||||
|
p++;
|
||||||
|
toklen = (size_t) (p - start);
|
||||||
|
while (toklen > 0 && *start == ' ') {
|
||||||
|
start++;
|
||||||
|
toklen--;
|
||||||
|
}
|
||||||
|
while (toklen > 0 && start[toklen - 1] == ' ')
|
||||||
|
toklen--;
|
||||||
|
if (toklen == 1 && start[0] == '*')
|
||||||
|
return 1;
|
||||||
|
if (toklen == keylen && strncmp(start, arg, keylen) == 0)
|
||||||
|
return 1;
|
||||||
|
if (*p == ',')
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* see htscore.h */
|
||||||
|
char *fil_normalized_filtered_ex(const char *source, char *dest,
|
||||||
|
const char *strip, int do_slash,
|
||||||
|
int do_query) {
|
||||||
|
const char *query;
|
||||||
|
char BIGSTK tmp[HTS_URLMAXSIZE * 2];
|
||||||
|
htsbuff cb;
|
||||||
|
int wrote = 0;
|
||||||
|
|
||||||
|
/* No strip list, or no query: plain normalization. */
|
||||||
|
if (strip == NULL || *strip == '\0' ||
|
||||||
|
(query = strchr(source, '?')) == NULL) {
|
||||||
|
return fil_normalized_ex(source, dest, do_slash, do_query);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy the path, re-emit kept query args, let fil_normalized() sort. Walk
|
||||||
|
every field incl. empty/trailing ("a&","?&&") so the result is a fixpoint
|
||||||
|
(the read re-normalizes it; a dropped empty arg would miss dedup). */
|
||||||
|
cb = htsbuff_ptr(tmp, sizeof(tmp));
|
||||||
|
htsbuff_catn(&cb, source, (size_t) (query - source));
|
||||||
|
for (query++;;) {
|
||||||
|
const char *const arg = query;
|
||||||
|
const char *eq = NULL;
|
||||||
|
size_t keylen, arglen;
|
||||||
|
|
||||||
|
while (*query != '\0' && *query != '&') {
|
||||||
|
if (eq == NULL && *query == '=')
|
||||||
|
eq = query;
|
||||||
|
query++;
|
||||||
|
}
|
||||||
|
arglen = (size_t) (query - arg);
|
||||||
|
keylen = eq != NULL ? (size_t) (eq - arg) : arglen;
|
||||||
|
if (!hts_query_key_stripped(arg, keylen, strip)) {
|
||||||
|
htsbuff_catc(&cb, wrote ? '&' : '?');
|
||||||
|
htsbuff_catn(&cb, arg, arglen);
|
||||||
|
wrote = 1;
|
||||||
|
}
|
||||||
|
if (*query == '\0')
|
||||||
|
break;
|
||||||
|
query++;
|
||||||
|
}
|
||||||
|
return fil_normalized_ex(tmp, dest, do_slash, do_query);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* see htscore.h */
|
||||||
|
char *fil_normalized_filtered(const char *source, char *dest,
|
||||||
|
const char *strip) {
|
||||||
|
return fil_normalized_filtered_ex(source, dest, strip, 1, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* see htscore.h */
|
||||||
|
const char *hts_query_strip_keys(const char *rules, const char *adr,
|
||||||
|
const char *fil, char *dest, size_t destsize) {
|
||||||
|
const char *p, *q;
|
||||||
|
const char *result = NULL;
|
||||||
|
char BIGSTK url[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
|
if (rules == NULL || *rules == '\0' || destsize == 0)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Match string = normalized host/path, query removed. jump_normalized_const
|
||||||
|
collapses www+scheme/auth so read and write (double-normalized) agree;
|
||||||
|
query excluded keeps the decision on host/path only. */
|
||||||
|
url[0] = '\0';
|
||||||
|
strcatbuff(url, jump_normalized_const(adr));
|
||||||
|
if (fil[0] != '/')
|
||||||
|
strcatbuff(url, "/");
|
||||||
|
q = strchr(fil, '?');
|
||||||
|
if (q != NULL)
|
||||||
|
strncatbuff(url, fil, (int) (q - fil));
|
||||||
|
else
|
||||||
|
strcatbuff(url, fil);
|
||||||
|
|
||||||
|
/* Walk the '\n' entries; last match wins (like the +/- filter eval). Each is
|
||||||
|
"pattern=keys"; no '=' is the bare form, pattern "*". */
|
||||||
|
for (p = rules; *p != '\0';) {
|
||||||
|
const char *const line = p;
|
||||||
|
const char *eol, *eq, *keys;
|
||||||
|
char BIGSTK pat[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
|
while (*p != '\0' && *p != '\n')
|
||||||
|
p++;
|
||||||
|
eol = p;
|
||||||
|
if (*p == '\n')
|
||||||
|
p++;
|
||||||
|
if (eol == line)
|
||||||
|
continue;
|
||||||
|
eq = memchr(line, '=', (size_t) (eol - line));
|
||||||
|
if (eq != NULL) {
|
||||||
|
size_t patlen = (size_t) (eq - line);
|
||||||
|
|
||||||
|
if (patlen >= sizeof(pat))
|
||||||
|
patlen = sizeof(pat) - 1;
|
||||||
|
memcpy(pat, line, patlen);
|
||||||
|
pat[patlen] = '\0';
|
||||||
|
keys = eq + 1;
|
||||||
|
} else {
|
||||||
|
pat[0] = '*';
|
||||||
|
pat[1] = '\0';
|
||||||
|
keys = line;
|
||||||
|
}
|
||||||
|
if (strjoker(url, pat, NULL, NULL) != NULL) {
|
||||||
|
size_t klen = (size_t) (eol - keys);
|
||||||
|
|
||||||
|
if (klen >= destsize)
|
||||||
|
klen = destsize - 1;
|
||||||
|
memcpy(dest, keys, klen);
|
||||||
|
dest[klen] = '\0';
|
||||||
|
result = dest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
#define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 );
|
#define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 );
|
||||||
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
|
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
|
||||||
size_t destsize) {
|
size_t destsize) {
|
||||||
@@ -4164,9 +4327,10 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
|||||||
/* Check html -> text/html */
|
/* Check html -> text/html */
|
||||||
const char *a = fil + strlen(fil) - 1;
|
const char *a = fil + strlen(fil) - 1;
|
||||||
|
|
||||||
while((*a != '.') && (*a != '/') && (a > fil))
|
/* a < fil when fil is empty: bound before dereferencing */
|
||||||
|
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||||
a--;
|
a--;
|
||||||
if (*a == '.' && strlen(a) < 32) {
|
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||||
int j = 0;
|
int j = 0;
|
||||||
|
|
||||||
a++;
|
a++;
|
||||||
@@ -4753,66 +4917,53 @@ int hts_read(htsblk * r, char *buff, int size) {
|
|||||||
// -- Gestion cache DNS --
|
// -- Gestion cache DNS --
|
||||||
// 'RX98
|
// 'RX98
|
||||||
|
|
||||||
// 'capsule' contenant uniquement le cache
|
// Free a DNS cache record (coucal value handler).
|
||||||
t_dnscache *hts_cache(httrackp * opt) {
|
static void hts_cache_value_free(coucal_opaque arg, coucal_value value) {
|
||||||
|
void *record = value.ptr;
|
||||||
|
|
||||||
|
(void) arg;
|
||||||
|
freet(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
// opt's DNS cache hashtable, created on first use. Records (t_dnscache*) are
|
||||||
|
// owned by the table and freed by hts_cache_value_free on coucal_delete.
|
||||||
|
coucal hts_cache(httrackp *opt) {
|
||||||
assertf(opt != NULL);
|
assertf(opt != NULL);
|
||||||
if (opt->state.dns_cache == NULL) {
|
if (opt->state.dns_cache == NULL) {
|
||||||
opt->state.dns_cache = (t_dnscache *) malloct(sizeof(t_dnscache));
|
coucal cache = coucal_new(0);
|
||||||
memset(opt->state.dns_cache, 0, sizeof(t_dnscache));
|
|
||||||
|
coucal_set_name(cache, "dns_cache");
|
||||||
|
coucal_value_set_value_handler(cache, hts_cache_value_free, NULL);
|
||||||
|
opt->state.dns_cache = cache;
|
||||||
}
|
}
|
||||||
assertf(opt->state.dns_cache != NULL);
|
assertf(opt->state.dns_cache != NULL);
|
||||||
/* first entry is NULL */
|
|
||||||
assertf(opt->state.dns_cache->iadr == NULL);
|
|
||||||
return opt->state.dns_cache;
|
return opt->state.dns_cache;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Free DNS cache.
|
// MUST BE LOCKED (coucal is not internally serialized vs FTP/web threads)
|
||||||
void hts_cache_free(t_dnscache *const root) {
|
// Look up iadr in the DNS cache, filling out[0..min(count,max)-1].
|
||||||
if (root != NULL) {
|
// Returns: -1 not yet tested; 0 negative-cached (not in DNS); >0 address count.
|
||||||
t_dnscache *cache;
|
static int hts_ghbn_all(coucal cache, const char *const iadr,
|
||||||
for(cache = root; cache != NULL; ) {
|
SOCaddr *const out, const int max) {
|
||||||
t_dnscache *const next = cache->next;
|
void *ptr;
|
||||||
cache->next = NULL;
|
|
||||||
freet(cache);
|
|
||||||
cache = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// lock le cache dns pour tout opération d'ajout
|
assertf(out != NULL);
|
||||||
// plus prudent quand plusieurs threads peuvent écrire dedans..
|
|
||||||
// -1: status? 0: libérer 1:locker
|
|
||||||
|
|
||||||
// MUST BE LOCKED
|
|
||||||
// routine pour le cache - retour optionnel à donner à chaque fois
|
|
||||||
// NULL: nom non encore testé dans le cache
|
|
||||||
// si h_length==0 alors le nom n'existe pas dans le dns
|
|
||||||
static SOCaddr* hts_ghbn(const t_dnscache *cache, const char *const iadr, SOCaddr *const addr) {
|
|
||||||
assertf(addr != NULL);
|
|
||||||
assertf(iadr != NULL);
|
assertf(iadr != NULL);
|
||||||
if (*iadr == '\0') {
|
if (*iadr == '\0') {
|
||||||
return NULL;
|
return -1;
|
||||||
}
|
}
|
||||||
/* first entry is empty */
|
if (coucal_read_pvoid(cache, iadr, &ptr)) { // ok trouvé
|
||||||
if (cache->iadr == NULL) {
|
const t_dnscache *const record = (const t_dnscache *) ptr;
|
||||||
cache = cache->next;
|
int i;
|
||||||
}
|
|
||||||
for(; cache != NULL; cache = cache->next) {
|
assertf(record->host_count <= HTS_MAXADDRNUM);
|
||||||
assertf(cache != NULL);
|
for (i = 0; i < record->host_count && i < max; i++) {
|
||||||
assertf(cache->iadr != NULL);
|
assertf(record->host_length[i] <= sizeof(record->host_addr[i]));
|
||||||
assertf(cache->iadr == (const char*) cache + sizeof(t_dnscache));
|
SOCaddr_copyaddr2(out[i], record->host_addr[i], record->host_length[i]);
|
||||||
if (strcmp(cache->iadr, iadr) == 0) { // ok trouvé
|
|
||||||
if (cache->host_length != 0) { // entrée valide
|
|
||||||
assertf(cache->host_length <= sizeof(cache->host_addr));
|
|
||||||
SOCaddr_copyaddr2(*addr, cache->host_addr, cache->host_length);
|
|
||||||
return addr;
|
|
||||||
} else { // erreur dans le dns, déja vérifié
|
|
||||||
SOCaddr_clear(*addr);
|
|
||||||
return addr;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
return record->host_count;
|
||||||
}
|
}
|
||||||
return NULL;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HTS_INET6 != 0
|
#if HTS_INET6 != 0
|
||||||
@@ -4826,84 +4977,239 @@ static const hts_resolver_backend *hts_resolver = &hts_resolver_libc;
|
|||||||
void hts_dns_set_resolver_backend(const hts_resolver_backend *backend) {
|
void hts_dns_set_resolver_backend(const hts_resolver_backend *backend) {
|
||||||
hts_resolver = (backend != NULL) ? backend : &hts_resolver_libc;
|
hts_resolver = (backend != NULL) ? backend : &hts_resolver_libc;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static SOCaddr *hts_dns_resolve_nocache2_(const char *const hostname,
|
/* Debug/test hook: HTTRACK_DEBUG_RESOLVE="host:ip[,ip...]" pins the resolution
|
||||||
SOCaddr *const addr,
|
of `host` to the listed addresses (curl --resolve style), so the connect
|
||||||
const char **error) {
|
fallback can be exercised deterministically (a dead address first, a live one
|
||||||
{
|
next). Any other host resolves normally. Below: an addrinfo backend that owns
|
||||||
#if HTS_INET6==0
|
its chain (its own freeaddrinfo), so a synthesized and a delegated result
|
||||||
/* IPv4 resolver */
|
free the same way. */
|
||||||
struct hostent *const hp = gethostbyname(hostname);
|
|
||||||
|
|
||||||
if (hp != NULL) {
|
/* Deep-copy a libc addrinfo chain into our own allocations. */
|
||||||
SOCaddr_copyaddr2(addr, hp->h_addr_list[0], hp->h_length);
|
static struct addrinfo *resolver_dup_chain(const struct addrinfo *src) {
|
||||||
return SOCaddr_is_valid(addr) ? &addr : NULL;
|
struct addrinfo *head = NULL, *tail = NULL;
|
||||||
} else {
|
|
||||||
SOCaddr_clear(*addr);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
/* IPv6 resolver */
|
|
||||||
struct addrinfo *res = NULL;
|
|
||||||
struct addrinfo hints;
|
|
||||||
int gerr;
|
|
||||||
|
|
||||||
SOCaddr_clear(*addr);
|
for (; src != NULL; src = src->ai_next) {
|
||||||
memset(&hints, 0, sizeof(hints));
|
struct addrinfo *const ai = calloct(1, sizeof(*ai));
|
||||||
if (IPV6_resolver == 1) // V4 only (for bogus V6 entries)
|
|
||||||
hints.ai_family = PF_INET;
|
ai->ai_family = src->ai_family;
|
||||||
else if (IPV6_resolver == 2) // V6 only (for testing V6 only)
|
ai->ai_socktype = src->ai_socktype;
|
||||||
hints.ai_family = PF_INET6;
|
ai->ai_protocol = src->ai_protocol;
|
||||||
else // V4 + V6
|
ai->ai_addrlen = src->ai_addrlen;
|
||||||
hints.ai_family = PF_UNSPEC;
|
ai->ai_addr = malloct(src->ai_addrlen);
|
||||||
hints.ai_socktype = SOCK_STREAM;
|
memcpy(ai->ai_addr, src->ai_addr, src->ai_addrlen);
|
||||||
hints.ai_protocol = IPPROTO_TCP;
|
if (head == NULL)
|
||||||
if ((gerr = hts_resolver->getaddrinfo(hostname, NULL, &hints, &res)) == 0) {
|
head = ai;
|
||||||
if (res != NULL) {
|
else
|
||||||
if (res->ai_addr != NULL && res->ai_addrlen != 0) {
|
tail->ai_next = ai;
|
||||||
SOCaddr_copyaddr2(*addr, res->ai_addr, res->ai_addrlen);
|
tail = ai;
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (error != NULL) {
|
|
||||||
*error = gai_strerror(gerr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (res) {
|
|
||||||
hts_resolver->freeaddrinfo(res);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
return head;
|
||||||
return SOCaddr_is_valid(*addr) ? addr : NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
/* Build one addrinfo node from an IPv4/IPv6 literal, or NULL if it does not
|
||||||
SOCaddr *const addr, const char **error) {
|
parse or is filtered out by want_family (AF_INET/AF_INET6/PF_UNSPEC). */
|
||||||
/* Protection */
|
static struct addrinfo *resolver_make_ai(const char *ip, int want_family) {
|
||||||
if (!strnotempty(hostname)) {
|
struct addrinfo *ai;
|
||||||
return NULL;
|
|
||||||
|
if (strchr(ip, ':') != NULL) { // IPv6 literal
|
||||||
|
struct sockaddr_in6 sa6;
|
||||||
|
|
||||||
|
if (want_family != PF_UNSPEC && want_family != AF_INET6)
|
||||||
|
return NULL;
|
||||||
|
memset(&sa6, 0, sizeof(sa6));
|
||||||
|
if (inet_pton(AF_INET6, ip, &sa6.sin6_addr) != 1)
|
||||||
|
return NULL;
|
||||||
|
sa6.sin6_family = AF_INET6;
|
||||||
|
ai = calloct(1, sizeof(*ai));
|
||||||
|
ai->ai_family = AF_INET6;
|
||||||
|
ai->ai_addrlen = sizeof(sa6);
|
||||||
|
ai->ai_addr = malloct(sizeof(sa6));
|
||||||
|
memcpy(ai->ai_addr, &sa6, sizeof(sa6));
|
||||||
|
} else { // IPv4 literal
|
||||||
|
struct sockaddr_in sa;
|
||||||
|
|
||||||
|
if (want_family != PF_UNSPEC && want_family != AF_INET)
|
||||||
|
return NULL;
|
||||||
|
memset(&sa, 0, sizeof(sa));
|
||||||
|
if (inet_pton(AF_INET, ip, &sa.sin_addr) != 1)
|
||||||
|
return NULL;
|
||||||
|
sa.sin_family = AF_INET;
|
||||||
|
ai = calloct(1, sizeof(*ai));
|
||||||
|
ai->ai_family = AF_INET;
|
||||||
|
ai->ai_addrlen = sizeof(sa);
|
||||||
|
ai->ai_addr = malloct(sizeof(sa));
|
||||||
|
memcpy(ai->ai_addr, &sa, sizeof(sa));
|
||||||
|
}
|
||||||
|
return ai;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void override_freeaddrinfo(struct addrinfo *res) {
|
||||||
|
while (res != NULL) {
|
||||||
|
struct addrinfo *const next = res->ai_next;
|
||||||
|
|
||||||
|
freet(res->ai_addr);
|
||||||
|
freet(res);
|
||||||
|
res = next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int override_getaddrinfo(const char *node, const char *service,
|
||||||
|
const struct addrinfo *hints,
|
||||||
|
struct addrinfo **res) {
|
||||||
|
const char *const spec = getenv("HTTRACK_DEBUG_RESOLVE");
|
||||||
|
const int want = (hints != NULL) ? hints->ai_family : PF_UNSPEC;
|
||||||
|
const char *colon;
|
||||||
|
|
||||||
|
*res = NULL;
|
||||||
|
if (spec != NULL && node != NULL && (colon = strchr(spec, ':')) != NULL &&
|
||||||
|
(size_t) (colon - spec) == strlen(node) &&
|
||||||
|
strncmp(spec, node, colon - spec) == 0) {
|
||||||
|
struct addrinfo *head = NULL, *tail = NULL;
|
||||||
|
char buf[256];
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
buf[0] = '\0';
|
||||||
|
strncatbuff(buf, colon + 1, sizeof(buf) - 1);
|
||||||
|
for (p = strtok(buf, ","); p != NULL; p = strtok(NULL, ",")) {
|
||||||
|
struct addrinfo *const ai = resolver_make_ai(p, want);
|
||||||
|
|
||||||
|
if (ai != NULL) {
|
||||||
|
if (head == NULL)
|
||||||
|
head = ai;
|
||||||
|
else
|
||||||
|
tail->ai_next = ai;
|
||||||
|
tail = ai;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (head == NULL)
|
||||||
|
return EAI_NONAME;
|
||||||
|
*res = head;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* not overridden: delegate to libc, copying into our owned format */
|
||||||
Strip [] if any : [3ffe:b80:1234:1::1]
|
{
|
||||||
The resolver doesn't seem to handle IP6 addresses in brackets
|
struct addrinfo *sys = NULL;
|
||||||
*/
|
int gerr = getaddrinfo(node, service, hints, &sys);
|
||||||
|
|
||||||
|
if (gerr != 0)
|
||||||
|
return gerr;
|
||||||
|
*res = resolver_dup_chain(sys);
|
||||||
|
freeaddrinfo(sys);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Resolver backend made of the two override_* entry points above: its lookup
   function and the matching release function for the chains it returns. */
static const hts_resolver_backend hts_resolver_override = {
  override_getaddrinfo,
  override_freeaddrinfo
};
|
||||||
|
|
||||||
|
/* Install the env override once, unless a backend was already set (self-test).
|
||||||
|
*/
|
||||||
|
static void hts_resolver_check_env(void) {
|
||||||
|
static int checked = 0;
|
||||||
|
|
||||||
|
if (!checked) {
|
||||||
|
checked = 1;
|
||||||
|
if (hts_resolver == &hts_resolver_libc &&
|
||||||
|
getenv("HTTRACK_DEBUG_RESOLVE") != NULL) {
|
||||||
|
hts_resolver = &hts_resolver_override;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Resolve hostname into up to max addresses (resolver/RFC 6724 order), no
|
||||||
|
// cache. Returns the count copied into out[0..count-1]; 0 = does not resolve.
|
||||||
|
static int hts_dns_resolve_nocache_list_(const char *const hostname,
|
||||||
|
SOCaddr *const out, const int max,
|
||||||
|
const char **error) {
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
#if HTS_INET6==0
|
||||||
|
/* IPv4 resolver */
|
||||||
|
struct hostent *const hp = gethostbyname(hostname);
|
||||||
|
|
||||||
|
if (hp != NULL) {
|
||||||
|
char **h;
|
||||||
|
|
||||||
|
for (h = hp->h_addr_list; count < max && h != NULL && *h != NULL; h++) {
|
||||||
|
SOCaddr_clear(out[count]);
|
||||||
|
SOCaddr_copyaddr2(out[count], *h, hp->h_length);
|
||||||
|
if (SOCaddr_is_valid(out[count]))
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* IPv6 resolver */
|
||||||
|
struct addrinfo *res = NULL, *cur;
|
||||||
|
struct addrinfo hints;
|
||||||
|
int gerr;
|
||||||
|
|
||||||
|
hts_resolver_check_env();
|
||||||
|
memset(&hints, 0, sizeof(hints));
|
||||||
|
if (IPV6_resolver == 1) // V4 only (for bogus V6 entries)
|
||||||
|
hints.ai_family = PF_INET;
|
||||||
|
else if (IPV6_resolver == 2) // V6 only (for testing V6 only)
|
||||||
|
hints.ai_family = PF_INET6;
|
||||||
|
else // V4 + V6
|
||||||
|
hints.ai_family = PF_UNSPEC;
|
||||||
|
hints.ai_socktype = SOCK_STREAM;
|
||||||
|
hints.ai_protocol = IPPROTO_TCP;
|
||||||
|
if ((gerr = hts_resolver->getaddrinfo(hostname, NULL, &hints, &res)) == 0) {
|
||||||
|
for (cur = res; cur != NULL && count < max; cur = cur->ai_next) {
|
||||||
|
if (cur->ai_addr != NULL && cur->ai_addrlen != 0) {
|
||||||
|
SOCaddr_clear(out[count]);
|
||||||
|
SOCaddr_copyaddr2(out[count], cur->ai_addr, cur->ai_addrlen);
|
||||||
|
if (SOCaddr_is_valid(out[count]))
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (error != NULL) {
|
||||||
|
*error = gai_strerror(gerr);
|
||||||
|
}
|
||||||
|
if (res) {
|
||||||
|
hts_resolver->freeaddrinfo(res);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strip [] around a literal IPv6 ([3ffe:b80:1234:1::1]) the resolver won't
|
||||||
|
// take, then resolve into a list. Returns the count.
|
||||||
|
static int hts_dns_resolve_nocache_list(const char *const hostname,
|
||||||
|
SOCaddr *const out, const int max,
|
||||||
|
const char **error) {
|
||||||
|
if (!strnotempty(hostname) || max <= 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if ((hostname[0] == '[') && (hostname[strlen(hostname) - 1] == ']')) {
|
if ((hostname[0] == '[') && (hostname[strlen(hostname) - 1] == ']')) {
|
||||||
SOCaddr *ret;
|
|
||||||
size_t size = strlen(hostname);
|
size_t size = strlen(hostname);
|
||||||
char *copy = malloct(size + 1);
|
char *copy = malloct(size + 1);
|
||||||
|
int count;
|
||||||
|
|
||||||
assertf(copy != NULL);
|
assertf(copy != NULL);
|
||||||
copy[0] = '\0';
|
copy[0] = '\0';
|
||||||
strncat(copy, hostname + 1, size - 2);
|
strncat(copy, hostname + 1, size - 2);
|
||||||
ret = hts_dns_resolve_nocache2_(copy, addr, error);
|
count = hts_dns_resolve_nocache_list_(copy, out, max, error);
|
||||||
freet(copy);
|
freet(copy);
|
||||||
return ret;
|
return count;
|
||||||
} else {
|
} else {
|
||||||
return hts_dns_resolve_nocache2_(hostname, addr, error);
|
return hts_dns_resolve_nocache_list_(hostname, out, max, error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Single-address, uncached resolution. *addr is cleared first, then filled
   with the first address hts_dns_resolve_nocache_list() yields for hostname.
   Returns addr when an address was found and passes SOCaddr_is_valid(),
   NULL otherwise. error is forwarded unchanged to the list resolver. */
HTSEXT_API SOCaddr *hts_dns_resolve_nocache2(const char *const hostname,
                                             SOCaddr *const addr,
                                             const char **error) {
  int found;

  SOCaddr_clear(*addr);
  found = hts_dns_resolve_nocache_list(hostname, addr, 1, error);
  if (found <= 0) {
    return NULL;
  }
  return SOCaddr_is_valid(*addr) ? addr : NULL;
}
|
||||||
|
|
||||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache(const char *const hostname, SOCaddr *const addr) {
|
HTSEXT_API SOCaddr* hts_dns_resolve_nocache(const char *const hostname, SOCaddr *const addr) {
|
||||||
return hts_dns_resolve_nocache2(hostname, addr, NULL);
|
return hts_dns_resolve_nocache2(hostname, addr, NULL);
|
||||||
}
|
}
|
||||||
@@ -4914,16 +5220,18 @@ HTSEXT_API int check_hostname_dns(const char *const hostname) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Needs locking
|
// Needs locking
|
||||||
// cache dns interne à HTS // ** FREE A FAIRE sur la chaine
|
// Internal DNS cache. Fill out[0..count-1] with up to max addresses for _iadr,
|
||||||
static SOCaddr* hts_dns_resolve_(httrackp * opt, const char *_iadr,
|
// resolving (and caching the full list) on a miss. Returns the count.
|
||||||
SOCaddr *const addr, const char **error) {
|
static int hts_dns_resolve_list_(httrackp *opt, const char *_iadr,
|
||||||
|
SOCaddr *const out, const int max,
|
||||||
|
const char **error) {
|
||||||
char BIGSTK iadr[HTS_URLMAXSIZE * 2];
|
char BIGSTK iadr[HTS_URLMAXSIZE * 2];
|
||||||
t_dnscache *cache = hts_cache(opt); // adresse du cache
|
coucal cache = hts_cache(opt); // le cache dns
|
||||||
SOCaddr *sa;
|
int count;
|
||||||
|
|
||||||
assertf(opt != NULL);
|
assertf(opt != NULL);
|
||||||
assertf(_iadr != NULL);
|
assertf(_iadr != NULL);
|
||||||
assertf(addr != NULL);
|
assertf(out != NULL);
|
||||||
|
|
||||||
strcpybuff(iadr, jump_identification_const(_iadr));
|
strcpybuff(iadr, jump_identification_const(_iadr));
|
||||||
// couper éventuel :
|
// couper éventuel :
|
||||||
@@ -4935,55 +5243,67 @@ static SOCaddr* hts_dns_resolve_(httrackp * opt, const char *_iadr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* get IP from the dns cache */
|
/* get IP from the dns cache */
|
||||||
sa = hts_ghbn(cache, iadr, addr);
|
count = hts_ghbn_all(cache, iadr, out, max);
|
||||||
if (sa != NULL) {
|
if (count >= 0) { // cache hit (0 == negative-cached)
|
||||||
return SOCaddr_is_valid(*sa) ? sa : NULL;
|
return count;
|
||||||
} else { // non présent dans le cache dns, tester
|
} else { // non présent dans le cache dns, tester
|
||||||
const size_t iadr_len = strlen(iadr) + 1;
|
SOCaddr resolved[HTS_MAXADDRNUM];
|
||||||
|
t_dnscache *record;
|
||||||
// find queue
|
int i;
|
||||||
for(; cache->next != NULL; cache = cache->next) ;
|
|
||||||
|
|
||||||
#if DEBUGDNS
|
#if DEBUGDNS
|
||||||
printf("resolving (not cached) %s\n", iadr);
|
printf("resolving (not cached) %s\n", iadr);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
sa = hts_dns_resolve_nocache2(iadr, addr, error); // calculer IP host
|
count = hts_dns_resolve_nocache_list(iadr, resolved, HTS_MAXADDRNUM, error);
|
||||||
|
|
||||||
#if HTS_WIDE_DEBUG
|
#if HTS_WIDE_DEBUG
|
||||||
DEBUG_W("gethostbyname done\n");
|
DEBUG_W("gethostbyname done\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* attempt to store new entry */
|
/* attempt to store new entry (coucal owns it and dups the host key) */
|
||||||
cache->next = malloct(sizeof(t_dnscache) + iadr_len);
|
record = malloct(sizeof(t_dnscache));
|
||||||
if (cache->next != NULL) {
|
if (record != NULL) {
|
||||||
t_dnscache *const next = cache->next;
|
memset(record, 0, sizeof(*record));
|
||||||
char *const block = (char*) cache->next;
|
record->host_count = count;
|
||||||
char *const str = block + sizeof(t_dnscache);
|
for (i = 0; i < count; i++) {
|
||||||
memcpy(str, iadr, iadr_len);
|
record->host_length[i] = SOCaddr_size(resolved[i]);
|
||||||
next->iadr = str;
|
assertf(record->host_length[i] <= sizeof(record->host_addr[i]));
|
||||||
if (sa != NULL) {
|
memcpy(record->host_addr[i], &SOCaddr_sockaddr(resolved[i]),
|
||||||
next->host_length = SOCaddr_size(*sa);
|
record->host_length[i]);
|
||||||
assertf(next->host_length <= sizeof(next->host_addr));
|
|
||||||
memcpy(next->host_addr, &SOCaddr_sockaddr(*sa), next->host_length);
|
|
||||||
} else {
|
|
||||||
next->host_length = 0; // non existant dans le dns
|
|
||||||
}
|
}
|
||||||
next->next = NULL;
|
coucal_add_pvoid(cache, iadr, record);
|
||||||
return sa;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return result if any */
|
/* copy result to caller (cache store may have failed; result still valid)
|
||||||
return sa;
|
*/
|
||||||
} // retour hp du cache
|
for (i = 0; i < count && i < max; i++) {
|
||||||
|
SOCaddr_copy_SOCaddr(out[i], resolved[i]);
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
} // retour hp du cache
|
||||||
}
|
}
|
||||||
|
|
||||||
SOCaddr* hts_dns_resolve2(httrackp * opt, const char *_iadr, SOCaddr *const addr, const char **error) {
|
int hts_dns_resolve_all(httrackp *opt, const char *iadr, SOCaddr *out, int max,
|
||||||
SOCaddr *ret;
|
const char **error) {
|
||||||
|
int count;
|
||||||
|
|
||||||
|
if (!strnotempty(iadr) || max <= 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
ret = hts_dns_resolve_(opt, _iadr, addr, error);
|
count = hts_dns_resolve_list_(opt, iadr, out, max, error);
|
||||||
hts_mutexrelease(&opt->state.lock);
|
hts_mutexrelease(&opt->state.lock);
|
||||||
return ret;
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
SOCaddr *hts_dns_resolve2(httrackp *opt, const char *_iadr, SOCaddr *const addr,
|
||||||
|
const char **error) {
|
||||||
|
SOCaddr_clear(*addr);
|
||||||
|
if (hts_dns_resolve_all(opt, _iadr, addr, 1, error) > 0) {
|
||||||
|
return SOCaddr_is_valid(*addr) ? addr : NULL;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
SOCaddr* hts_dns_resolve(httrackp * opt, const char *_iadr, SOCaddr *const addr) {
|
SOCaddr* hts_dns_resolve(httrackp * opt, const char *_iadr, SOCaddr *const addr) {
|
||||||
@@ -5720,7 +6040,11 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
|||||||
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
|
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
|
||||||
opt->sizehack = HTS_FALSE;
|
opt->sizehack = HTS_FALSE;
|
||||||
opt->urlhack = HTS_TRUE;
|
opt->urlhack = HTS_TRUE;
|
||||||
|
opt->no_www_dedup = HTS_FALSE;
|
||||||
|
opt->no_slash_dedup = HTS_FALSE;
|
||||||
|
opt->no_query_dedup = HTS_FALSE;
|
||||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||||
|
StringCopy(opt->strip_query, "");
|
||||||
opt->ftp_proxy = HTS_TRUE;
|
opt->ftp_proxy = HTS_TRUE;
|
||||||
opt->convert_utf8 = HTS_TRUE;
|
opt->convert_utf8 = HTS_TRUE;
|
||||||
StringCopy(opt->filelist, "");
|
StringCopy(opt->filelist, "");
|
||||||
@@ -5823,14 +6147,14 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
|||||||
|
|
||||||
/* Cache */
|
/* Cache */
|
||||||
if (opt->state.dns_cache != NULL) {
|
if (opt->state.dns_cache != NULL) {
|
||||||
t_dnscache *root;
|
coucal root;
|
||||||
|
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
root = opt->state.dns_cache;
|
root = opt->state.dns_cache;
|
||||||
opt->state.dns_cache = NULL;
|
opt->state.dns_cache = NULL;
|
||||||
hts_mutexrelease(&opt->state.lock);
|
hts_mutexrelease(&opt->state.lock);
|
||||||
|
|
||||||
hts_cache_free(root);
|
coucal_delete(&root); // frees records via hts_cache_value_free
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Cancel chain */
|
/* Cancel chain */
|
||||||
@@ -5865,6 +6189,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
|||||||
StringFree(opt->urllist);
|
StringFree(opt->urllist);
|
||||||
StringFree(opt->footer);
|
StringFree(opt->footer);
|
||||||
StringFree(opt->mod_blacklist);
|
StringFree(opt->mod_blacklist);
|
||||||
|
StringFree(opt->strip_query);
|
||||||
|
|
||||||
StringFree(opt->path_html);
|
StringFree(opt->path_html);
|
||||||
StringFree(opt->path_html_utf8);
|
StringFree(opt->path_html_utf8);
|
||||||
|
|||||||
33
src/htslib.h
33
src/htslib.h
@@ -147,11 +147,13 @@ struct OLD_htsblk {
|
|||||||
#define HTS_DEF_FWSTRUCT_t_dnscache
|
#define HTS_DEF_FWSTRUCT_t_dnscache
|
||||||
typedef struct t_dnscache t_dnscache;
|
typedef struct t_dnscache t_dnscache;
|
||||||
#endif
|
#endif
|
||||||
|
// One DNS cache record, stored as a coucal value keyed by hostname.
|
||||||
struct t_dnscache {
|
struct t_dnscache {
|
||||||
struct t_dnscache *next;
|
// resolved addresses, in resolver (RFC 6724) order; host_count==0 means the
|
||||||
const char *iadr;
|
// name does not resolve (negative cache). host_count<=HTS_MAXADDRNUM.
|
||||||
size_t host_length; // length ; (4 or 16) ; 0 for error
|
int host_count;
|
||||||
char host_addr[HTS_MAXADDRLEN];
|
size_t host_length[HTS_MAXADDRNUM]; // sockaddr length of each (16 or 28)
|
||||||
|
char host_addr[HTS_MAXADDRNUM][HTS_MAXADDRLEN];
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Library internal definictions */
|
/* Library internal definictions */
|
||||||
@@ -191,6 +193,13 @@ int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
|||||||
//int newhttp(char* iadr,char* err=NULL);
|
//int newhttp(char* iadr,char* err=NULL);
|
||||||
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
||||||
int waitconnect);
|
int waitconnect);
|
||||||
|
/* Like newhttp(), but connect to the addr_index-th resolved address of the host
|
||||||
|
(0-based) instead of always the first; *addr_count, if non-NULL, is set to
|
||||||
|
the total resolved addresses. newhttp() == newhttp_addr(...,0,NULL). Used by
|
||||||
|
the slot scheduler to try the next address when a connect fails (dead IPv6
|
||||||
|
etc.). */
|
||||||
|
T_SOC newhttp_addr(httrackp *opt, const char *iadr, htsblk *retour, int port,
|
||||||
|
int waitconnect, int addr_index, int *addr_count);
|
||||||
HTS_INLINE void deletehttp(htsblk * r);
|
HTS_INLINE void deletehttp(htsblk * r);
|
||||||
HTS_INLINE int deleteaddr(htsblk * r);
|
HTS_INLINE int deleteaddr(htsblk * r);
|
||||||
HTS_INLINE void deletesoc(T_SOC soc);
|
HTS_INLINE void deletesoc(T_SOC soc);
|
||||||
@@ -215,9 +224,14 @@ void treatfirstline(htsblk * retour, const char *rcvd);
|
|||||||
|
|
||||||
// sous-fonctions
|
// sous-fonctions
|
||||||
LLint http_xfread1(htsblk * r, int bufl);
|
LLint http_xfread1(htsblk * r, int bufl);
|
||||||
HTS_INLINE SOCaddr* hts_dns_resolve2(httrackp * opt, const char *iadr,
|
/* Cached resolver: fill out[0..count-1] with up to max addresses for iadr (in
|
||||||
SOCaddr *const addr,
|
resolver order), returning the count (0 = does not resolve, negative-cached).
|
||||||
const char **error);
|
Resolves once per host; later calls read the DNS cache. Must hold no lock
|
||||||
|
(brackets opt->state.lock itself). */
|
||||||
|
int hts_dns_resolve_all(httrackp *opt, const char *iadr, SOCaddr *out, int max,
|
||||||
|
const char **error);
|
||||||
|
HTS_INLINE SOCaddr *hts_dns_resolve2(httrackp *opt, const char *iadr,
|
||||||
|
SOCaddr *const addr, const char **error);
|
||||||
HTS_INLINE SOCaddr* hts_dns_resolve(httrackp * opt, const char *iadr,
|
HTS_INLINE SOCaddr* hts_dns_resolve(httrackp * opt, const char *iadr,
|
||||||
SOCaddr *const addr);
|
SOCaddr *const addr);
|
||||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
||||||
@@ -230,8 +244,9 @@ HTSEXT_API int check_hostname_dns(const char *const hostname);
|
|||||||
int ftp_available(void);
|
int ftp_available(void);
|
||||||
|
|
||||||
#if HTS_DNSCACHE
|
#if HTS_DNSCACHE
|
||||||
void hts_cache_free(t_dnscache *const cache);
|
/* Return opt's DNS cache hashtable (hostname -> t_dnscache record), creating it
|
||||||
t_dnscache *hts_cache(httrackp * opt);
|
on first use. Records are owned by the table and freed on coucal_delete. */
|
||||||
|
coucal hts_cache(httrackp *opt);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// outils divers
|
// outils divers
|
||||||
|
|||||||
@@ -198,6 +198,13 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
// copy of fil, used for lookups (see urlhack)
|
// copy of fil, used for lookups (see urlhack)
|
||||||
const char *normadr = adr;
|
const char *normadr = adr;
|
||||||
const char *normfil = fil_complete;
|
const char *normfil = fil_complete;
|
||||||
|
/* query keys to strip for this URL (NULL = none); decoupled from urlhack */
|
||||||
|
char BIGSTK stripkeys[HTS_URLMAXSIZE];
|
||||||
|
const char *const strip =
|
||||||
|
StringNotEmpty(opt->strip_query)
|
||||||
|
? hts_query_strip_keys(StringBuff(opt->strip_query), adr,
|
||||||
|
fil_complete, stripkeys, sizeof(stripkeys))
|
||||||
|
: NULL;
|
||||||
const char *const print_adr = jump_protocol_const(adr);
|
const char *const print_adr = jump_protocol_const(adr);
|
||||||
const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
|
const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
|
||||||
|
|
||||||
@@ -230,9 +237,13 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
// www-42.foo.com -> foo.com
|
// www-42.foo.com -> foo.com
|
||||||
// foo.com/bar//foobar -> foo.com/bar/foobar
|
// foo.com/bar//foobar -> foo.com/bar/foobar
|
||||||
if (opt->urlhack) {
|
if (opt->urlhack) {
|
||||||
// copy of adr (without protocol), used for lookups (see urlhack)
|
// dedup-lookup key; honor the per-feature negatives like htshash.c so
|
||||||
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
|
// distinct URLs keep distinct savenames (else keep normadr = adr)
|
||||||
normfil = fil_normalized(fil_complete, normfil_);
|
if (!opt->no_www_dedup)
|
||||||
|
normadr = adr_normalized_sized(adr, normadr_, sizeof(normadr_));
|
||||||
|
normfil =
|
||||||
|
fil_normalized_filtered_ex(fil_complete, normfil_, strip,
|
||||||
|
!opt->no_slash_dedup, !opt->no_query_dedup);
|
||||||
} else {
|
} else {
|
||||||
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
|
if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
|
||||||
char *pos = strchr(adr_complete, ':');
|
char *pos = strchr(adr_complete, ':');
|
||||||
@@ -245,6 +256,11 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
normadr = normadr_;
|
normadr = normadr_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// strip still applies with urlhack off (host left untouched); no // or
|
||||||
|
// query-sort here, to match the hash key (norm_slash/norm_query are 0 when
|
||||||
|
// urlhack is off) so a URL is looked up under the key it was stored with
|
||||||
|
if (strip != NULL)
|
||||||
|
normfil = fil_normalized_filtered_ex(fil_complete, normfil_, strip, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// à afficher sans ftp://
|
// à afficher sans ftp://
|
||||||
@@ -760,9 +776,9 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
|
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Changer extension?
|
// Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm
|
||||||
// par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
|
// depending on the resolved type.
|
||||||
if (ext_chg && !opt->no_type_change) { // changer ext
|
if (ext_chg && !opt->no_type_change) {
|
||||||
char *a = fil + strlen(fil) - 1;
|
char *a = fil + strlen(fil) - 1;
|
||||||
|
|
||||||
if ((opt->debug > 1) && (opt->log != NULL)) {
|
if ((opt->debug > 1) && (opt->log != NULL)) {
|
||||||
@@ -774,11 +790,19 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
adr_complete, fil_complete, ext);
|
adr_complete, fil_complete, ext);
|
||||||
}
|
}
|
||||||
if (ext_chg == 1) {
|
if (ext_chg == 1) {
|
||||||
|
// Cut the old extension only when it is empty (a bare trailing dot), the
|
||||||
|
// new one, or a recognized one; an unknown trailing ".token" (e.g.
|
||||||
|
// /article-1.884291, #115) is part of the name, not an extension.
|
||||||
|
const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil);
|
||||||
|
const int known_ext = !*old_ext || strfield2(old_ext, ext) ||
|
||||||
|
is_knowntype(opt, fil) || is_dyntype(old_ext) ||
|
||||||
|
ishtml_ext(old_ext) != -1;
|
||||||
|
|
||||||
while((a > fil) && (*a != '.') && (*a != '/'))
|
while((a > fil) && (*a != '.') && (*a != '/'))
|
||||||
a--;
|
a--;
|
||||||
if (*a == '.')
|
if (*a == '.' && known_ext)
|
||||||
*a = '\0'; // couper
|
*a = '\0'; // cut
|
||||||
strcatbuff(fil, "."); // recopier point
|
strcatbuff(fil, "."); // re-add the dot
|
||||||
} else {
|
} else {
|
||||||
while((a > fil) && (*a != '/'))
|
while((a > fil) && (*a != '/'))
|
||||||
a--;
|
a--;
|
||||||
@@ -786,7 +810,7 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
a++;
|
a++;
|
||||||
*a = '\0';
|
*a = '\0';
|
||||||
}
|
}
|
||||||
strcatbuff(fil, ext); // copier ext/nom
|
strcatbuff(fil, ext); // append ext/name
|
||||||
}
|
}
|
||||||
// Rechercher premier / et dernier .
|
// Rechercher premier / et dernier .
|
||||||
{
|
{
|
||||||
@@ -1721,10 +1745,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
|||||||
StringBuff(opt->path_log), digest_filename);
|
StringBuff(opt->path_log), digest_filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* remove refname if any */
|
/* remove refname if any; HTS_TRUE if it was removed */
|
||||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||||
const char *fil) {
|
const char *fil) {
|
||||||
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
||||||
|
|
||||||
(void) UNLINK(filename);
|
return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -104,8 +104,9 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
|
|||||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||||
const char *fil);
|
const char *fil);
|
||||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
|
||||||
const char *fil);
|
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||||
|
const char *fil);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -241,7 +241,7 @@ struct htsoptstate {
|
|||||||
char *userhttptype;
|
char *userhttptype;
|
||||||
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
||||||
int verif_external_status;
|
int verif_external_status;
|
||||||
t_dnscache *dns_cache; /**< DNS resolution cache */
|
coucal dns_cache; /**< DNS resolution cache: hostname -> t_dnscache record */
|
||||||
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
||||||
/* HTML parsing state */
|
/* HTML parsing state */
|
||||||
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
||||||
@@ -529,6 +529,12 @@ struct httrackp {
|
|||||||
htslibhandles libHandles; /**< loaded external module handles */
|
htslibhandles libHandles; /**< loaded external module handles */
|
||||||
//
|
//
|
||||||
htsoptstate state; /**< embedded live engine state */
|
htsoptstate state; /**< embedded live engine state */
|
||||||
|
String strip_query; /**< query keys to drop when deduping URLs (-strip-query);
|
||||||
|
appended at the tail to keep field offsets stable */
|
||||||
|
hts_boolean
|
||||||
|
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
|
||||||
|
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
|
||||||
|
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Running statistics for a mirror. */
|
/* Running statistics for a mirror. */
|
||||||
|
|||||||
@@ -3602,16 +3602,28 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||||
|
|
||||||
// check whether URLHack is harmless or not
|
// check whether URLHack is harmless or not (per the effective
|
||||||
if (opt->urlhack) {
|
// sub-flags)
|
||||||
|
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||||
|
!opt->no_query_dedup)) {
|
||||||
|
const int norm_host = !opt->no_www_dedup;
|
||||||
|
const int norm_slash = !opt->no_slash_dedup;
|
||||||
|
const int norm_query = !opt->no_query_dedup;
|
||||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
n_adr[0] = n_fil[0] = '\0';
|
strlcpybuff(n_adr,
|
||||||
(void) adr_normalized_sized(moved->adr, n_adr, sizeof(n_adr));
|
norm_host ? jump_normalized_const(moved->adr)
|
||||||
(void) fil_normalized(moved->fil, n_fil);
|
: jump_identification_const(moved->adr),
|
||||||
(void) adr_normalized_sized(urladr(), pn_adr, sizeof(pn_adr));
|
sizeof(n_adr));
|
||||||
(void) fil_normalized(urlfil(), pn_fil);
|
strlcpybuff(pn_adr,
|
||||||
|
norm_host ? jump_normalized_const(urladr())
|
||||||
|
: jump_identification_const(urladr()),
|
||||||
|
sizeof(pn_adr));
|
||||||
|
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||||
|
norm_query);
|
||||||
|
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||||
|
norm_query);
|
||||||
if (strcasecmp(n_adr, pn_adr) == 0
|
if (strcasecmp(n_adr, pn_adr) == 0
|
||||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||||
hts_log_print(opt, LOG_WARNING,
|
hts_log_print(opt, LOG_WARNING,
|
||||||
@@ -3749,44 +3761,60 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
|
|
||||||
} // bloc
|
} // bloc
|
||||||
// erreur HTTP (ex: 404, not found)
|
// erreur HTTP (ex: 404, not found)
|
||||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
|
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
|
||||||
|| (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
|
(r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
|
||||||
) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
|
// 412/416: the resume partial is stale; re-get the whole file (#206)
|
||||||
if (fexist_utf8(heap(ptr)->sav)) {
|
lien_back *itemback = NULL;
|
||||||
remove(heap(ptr)->sav); // Eliminer
|
int had_partial = 0;
|
||||||
} else {
|
int ref_existed = 0;
|
||||||
hts_log_print(opt, LOG_WARNING,
|
int ref_gone;
|
||||||
"Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
|
|
||||||
r->msg, urladr(), urlfil(),
|
// Drop the temp-ref, its partial, and heap->sav so the re-get carries no
|
||||||
heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
|
// Range; else back_add rebuilds the same Range and loops.
|
||||||
|
if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
|
||||||
|
&itemback) == 0) {
|
||||||
|
had_partial = 1;
|
||||||
|
ref_existed = 1;
|
||||||
|
// best-effort: an orphaned partial cannot re-Range once the ref is gone
|
||||||
|
if (fexist_utf8(itemback->url_sav))
|
||||||
|
(void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||||
|
itemback->url_sav));
|
||||||
|
back_clear_entry(itemback);
|
||||||
|
freet(itemback);
|
||||||
}
|
}
|
||||||
if (!fexist_utf8(heap(ptr)->sav)) { // Bien éliminé? (sinon on boucle..)
|
// don't re-record if the ref survived (it would re-Range and loop)
|
||||||
#if HDEBUG
|
ref_gone =
|
||||||
printf("Partial content NOT up-to-date, reget all file for %s\n",
|
url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
|
||||||
heap(ptr)->sav);
|
!ref_existed;
|
||||||
#endif
|
if (fexist_utf8(heap(ptr)->sav)) {
|
||||||
|
had_partial = 1;
|
||||||
|
remove(heap(ptr)->sav);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-get once, only if a partial existed and both Range triggers are
|
||||||
|
// gone; a failed removal gives up rather than looping. range_used is
|
||||||
|
// unreliable (it does not survive the delayed-type two-pass).
|
||||||
|
if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
||||||
r->msg, urladr(), urlfil());
|
r->msg, urladr(), urlfil());
|
||||||
// enregistrer le MEME lien
|
|
||||||
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||||
heap_top()->testmode = heap(ptr)->testmode; // mode test?
|
heap_top()->testmode = heap(ptr)->testmode;
|
||||||
heap_top()->link_import = 0; // pas mode import
|
heap_top()->link_import = 0;
|
||||||
heap_top()->depth = heap(ptr)->depth;
|
heap_top()->depth = heap(ptr)->depth;
|
||||||
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
||||||
heap_top()->retry = heap(ptr)->retry;
|
heap_top()->retry = heap(ptr)->retry;
|
||||||
heap_top()->premier = heap(ptr)->premier;
|
heap_top()->premier = heap(ptr)->premier;
|
||||||
heap_top()->precedent = ptr;
|
heap_top()->precedent = ptr;
|
||||||
//
|
|
||||||
// canceller lien actuel
|
|
||||||
error = 1;
|
error = 1;
|
||||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||||
//
|
} else { // out of memory
|
||||||
} else { // oups erreur, plus de mémoire!!
|
XH_uninit;
|
||||||
XH_uninit; // désallocation mémoire & buffers
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
|
hts_log_print(opt, LOG_WARNING,
|
||||||
|
"Giving up on partial reget (%s) for %s%s", r->msg,
|
||||||
|
urladr(), urlfil());
|
||||||
error = 1;
|
error = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
1293
src/htsselftest.c
Normal file
1293
src/htsselftest.c
Normal file
File diff suppressed because it is too large
Load Diff
52
src/htsselftest.h
Normal file
52
src/htsselftest.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/*
|
||||||
|
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||||
|
Copyright (C) 2026 Xavier Roche and other contributors
|
||||||
|
|
||||||
|
SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||||
|
addresses or to collect any other private information about people. Doing so
|
||||||
|
would dishonor our work and waste the many hours we have spent on it.
|
||||||
|
|
||||||
|
Please visit our Website: http://www.httrack.com
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/* File: htsselftest.h */
|
||||||
|
/* named dispatch for the hidden engine self-tests */
|
||||||
|
/* Author: Xavier Roche */
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
|
||||||
|
#ifndef HTSSELFTEST_DEFH
|
||||||
|
#define HTSSELFTEST_DEFH
|
||||||
|
|
||||||
|
#ifdef HTS_INTERNAL_BYTECODE
|
||||||
|
|
||||||
|
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||||
|
#define HTS_DEF_FWSTRUCT_httrackp
|
||||||
|
typedef struct httrackp httrackp;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Run engine self-test `name` over the positional args argv[0..argc-1], or list
|
||||||
|
the available tests when name is NULL, empty, or "list". Prints the result;
|
||||||
|
returns the process exit code (0 == success). The caller owns option cleanup.
|
||||||
|
Reached through the hidden `httrack -#test[=NAME ...]` subcommand. */
|
||||||
|
int hts_selftest(httrackp *opt, const char *name, int argc, char **argv);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||||
# tool flags despite the #!/bin/bash above.
|
# tool flags despite the #!/bin/bash above.
|
||||||
|
|
||||||
# Golden cache-format regression test (driven by 'httrack -#B <dir>').
|
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||||
#
|
#
|
||||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||||
@@ -13,7 +13,7 @@
|
|||||||
# byte-exact.
|
# byte-exact.
|
||||||
#
|
#
|
||||||
# Regenerate the fixture after a deliberate format change with
|
# Regenerate the fixture after a deliberate format change with
|
||||||
# 'httrack -#B <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
# 'httrack -#test=cache-golden <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||||
# committed file.
|
# committed file.
|
||||||
|
|
||||||
set -eu
|
set -eu
|
||||||
@@ -37,11 +37,11 @@ trap 'rm -rf "$dir"' EXIT
|
|||||||
mkdir -p "$dir/hts-cache"
|
mkdir -p "$dir/hts-cache"
|
||||||
cp "$fixture/hts-cache/new.zip" "$dir/hts-cache/new.zip"
|
cp "$fixture/hts-cache/new.zip" "$dir/hts-cache/new.zip"
|
||||||
|
|
||||||
out=$(httrack -#B "$dir")
|
out=$(httrack -#test=cache-golden "$dir")
|
||||||
|
|
||||||
# Match the exact success line: the read must have found and verified every
|
# Match the exact success line: the read must have found and verified every
|
||||||
# entry, not merely failed to enter the mode (a bad -#B falls through to the
|
# entry, not merely failed to enter the mode (a renamed/removed test prints the
|
||||||
# usage screen, which also exits non-zero but never prints this).
|
# registry to stderr, which also exits non-zero but never prints this).
|
||||||
test "$out" = "cache-golden: OK" || {
|
test "$out" = "cache-golden: OK" || {
|
||||||
echo "expected 'cache-golden: OK', got: $out" >&2
|
echo "expected 'cache-golden: OK', got: $out" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
24
tests/01_engine-cache-writefail.test
Normal file
24
tests/01_engine-cache-writefail.test
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
|
||||||
|
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||||
|
# tool flags despite the #!/bin/bash above.
|
||||||
|
|
||||||
|
# Cache write-failure handling (httrack -#test=cache-writefail <dir>). #174/#219.
|
||||||
|
# A failing new.zip write (disk full) used to crash the process via assertf; it
|
||||||
|
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
|
||||||
|
# self-test asserts that; reverting the fix makes -#test=cache-writefail abort (SIGABRT) and fail.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
dir=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$dir"' EXIT
|
||||||
|
|
||||||
|
out=$(httrack -#test=cache-writefail "$dir")
|
||||||
|
|
||||||
|
# Match the exact success line (error logs also go to stdout); a renamed/removed
|
||||||
|
# test prints the registry to stderr, which exits non-zero but never prints this.
|
||||||
|
printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
|
||||||
|
echo "expected 'cache-writefail: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||||
# tool flags despite the #!/bin/bash above.
|
# tool flags despite the #!/bin/bash above.
|
||||||
|
|
||||||
# Cache create/read/update logic (driven by 'httrack -#A <dir>').
|
# Cache create/read/update logic (driven by 'httrack -#test=cache <dir>').
|
||||||
#
|
#
|
||||||
# The in-process self-test stores several hand-crafted edge entries (normal
|
# The in-process self-test stores several hand-crafted edge entries (normal
|
||||||
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
|
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
|
||||||
@@ -20,13 +20,13 @@ set -eu
|
|||||||
dir=$(mktemp -d)
|
dir=$(mktemp -d)
|
||||||
trap 'rm -rf "$dir"' EXIT
|
trap 'rm -rf "$dir"' EXIT
|
||||||
|
|
||||||
# Like the other -# debug modes, a trailing token (the working directory) is
|
# The working directory is a required argument; without it the test prints a
|
||||||
# required; a bare '-#A' falls through to the usage screen.
|
# usage line to stderr and returns non-zero.
|
||||||
out=$(httrack -#A "$dir")
|
out=$(httrack -#test=cache "$dir")
|
||||||
|
|
||||||
# Match the exact success line, so the test cannot pass for an unrelated reason
|
# Match the exact success line, so the test cannot pass for an unrelated reason
|
||||||
# (e.g. the -#A mode being gone and falling through to the usage screen, which
|
# (e.g. the cache test being gone, which prints the registry to stderr but
|
||||||
# also exits non-zero but never prints this).
|
# never prints this line).
|
||||||
test "$out" = "cache-selftest: OK" || {
|
test "$out" = "cache-selftest: OK" || {
|
||||||
echo "expected 'cache-selftest: OK', got: $out" >&2
|
echo "expected 'cache-selftest: OK', got: $out" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
@@ -4,13 +4,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
||||||
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
# -#test=charset <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||||
conv() {
|
conv() {
|
||||||
test "$(httrack -O /dev/null -#3 "$1" "$2")" == "$3" || exit 1
|
test "$(httrack -O /dev/null -#test=charset "$1" "$2")" == "$3" || exit 1
|
||||||
}
|
}
|
||||||
# crash probe: malformed input must exit cleanly, not abort.
|
# crash probe: malformed input must exit cleanly, not abort.
|
||||||
runs() {
|
runs() {
|
||||||
httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || exit 1
|
httrack -O /dev/null -#test=charset "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
||||||
@@ -31,7 +31,7 @@ conv 'us-ascii' 'hello' 'hello'
|
|||||||
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
|
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
|
||||||
# decoded and yields empty output (an error is printed to stderr).
|
# decoded and yields empty output (an error is printed to stderr).
|
||||||
conv 'no-such-charset-xyz' 'abc' 'abc'
|
conv 'no-such-charset-xyz' 'abc' 'abc'
|
||||||
test "$(httrack -O /dev/null -#3 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
test "$(httrack -O /dev/null -#test=charset 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||||
|
|
||||||
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
|
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
|
||||||
runs 'utf-8' $'\x80'
|
runs 'utf-8' $'\x80'
|
||||||
|
|||||||
@@ -1,14 +1,15 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#test=cookies' selftest.
|
||||||
|
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
# 'run' is an ignored placeholder argument.
|
||||||
out=$(httrack -#Q run)
|
out=$(httrack -#test=cookies run)
|
||||||
|
|
||||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||||
|
# to stderr) can't pass.
|
||||||
test "$out" = "cookie-header: OK" || {
|
test "$out" = "cookie-header: OK" || {
|
||||||
echo "expected 'cookie-header: OK', got: $out" >&2
|
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
@@ -2,15 +2,16 @@
|
|||||||
#
|
#
|
||||||
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||||
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||||
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
# they silently stop being copied. Driven by the in-process 'httrack -#test=copyopt' test.
|
||||||
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||||
|
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
# 'run' is an ignored placeholder argument.
|
||||||
out=$(httrack -#9 run)
|
out=$(httrack -#test=copyopt run)
|
||||||
|
|
||||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||||
|
# to stderr) can't pass.
|
||||||
test "$out" = "copy-htsopt: OK" || {
|
test "$out" = "copy-htsopt: OK" || {
|
||||||
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
@@ -5,9 +5,8 @@ set -euo pipefail
|
|||||||
|
|
||||||
# DNS resolver/cache self-test: a mock getaddrinfo (no network) checks address
|
# DNS resolver/cache self-test: a mock getaddrinfo (no network) checks address
|
||||||
# family, single-address selection, the -@i4/-@i6 family filter, and cache reuse.
|
# family, single-address selection, the -@i4/-@i6 family filter, and cache reuse.
|
||||||
# The trailing token is required, like the other -# selftests, so a bare command
|
# 'run' is an ignored placeholder argument.
|
||||||
# line isn't treated as "no arguments" and routed to the usage screen.
|
out=$(httrack -#test=dns run)
|
||||||
out=$(httrack -#D run)
|
|
||||||
|
|
||||||
test "$out" = "dns-selftest: OK" || {
|
test "$out" = "dns-selftest: OK" || {
|
||||||
echo "expected 'dns-selftest: OK', got: $out" >&2
|
echo "expected 'dns-selftest: OK', got: $out" >&2
|
||||||
|
|||||||
@@ -4,13 +4,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
||||||
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
# -#test=entities <string> prints the string with entities decoded (UTF-8 output).
|
||||||
ent() {
|
ent() {
|
||||||
test "$(httrack -O /dev/null -#6 "$1")" == "$2" || exit 1
|
test "$(httrack -O /dev/null -#test=entities "$1")" == "$2" || exit 1
|
||||||
}
|
}
|
||||||
# crash probe: malformed input must exit cleanly, not abort.
|
# crash probe: malformed input must exit cleanly, not abort.
|
||||||
runs() {
|
runs() {
|
||||||
httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || exit 1
|
httrack -O /dev/null -#test=entities "$1" >/dev/null 2>&1 || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# named entities
|
# named entities
|
||||||
|
|||||||
65
tests/01_engine-filelist.test
Normal file
65
tests/01_engine-filelist.test
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# -%L URL-list loading (#49): a readable list is honored; an unusable one fails
|
||||||
|
# with the reason (errno / not-a-regular-file), not a bare "Could not include
|
||||||
|
# URL list". Offline: file:// fixture, no server. Asserts on httrack's own
|
||||||
|
# strings and the message shape, so it is locale-independent.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
tmp=$(mktemp -d "${TMPDIR:-/tmp}/httrack_filelist.XXXXXX") || exit 1
|
||||||
|
trap 'rm -rf "$tmp"' EXIT HUP INT QUIT PIPE TERM
|
||||||
|
|
||||||
|
echo '<html><body>hi</body></html>' >"$tmp/index.html"
|
||||||
|
|
||||||
|
# run httrack with the given -%L target; structured log lands in $out/hts-log.txt
|
||||||
|
run() {
|
||||||
|
local out="$1" list="$2"
|
||||||
|
rm -rf "$out"
|
||||||
|
mkdir -p "$out"
|
||||||
|
httrack -O "$out" --quiet -n "-%L" "$list" >"$out/.stdout" 2>&1 || true
|
||||||
|
LOG="$out/hts-log.txt"
|
||||||
|
}
|
||||||
|
|
||||||
|
fail() {
|
||||||
|
echo "FAIL: $1"
|
||||||
|
cat "$LOG"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
loghas() {
|
||||||
|
grep -Eq "$1" "$LOG" || fail "expected /$1/ in $LOG"
|
||||||
|
}
|
||||||
|
lognot() {
|
||||||
|
if grep -Eq "$1" "$LOG"; then fail "unexpected /$1/ in $LOG"; fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# readable list: its one URL is loaded and counted (count must be non-zero)
|
||||||
|
printf 'file://%s/index.html\n' "$tmp" >"$tmp/urls.txt"
|
||||||
|
run "$tmp/ok" "$tmp/urls.txt"
|
||||||
|
loghas '[1-9][0-9]* links added from'
|
||||||
|
|
||||||
|
# missing file: quoted name + a non-empty reason, never the old reasonless
|
||||||
|
# "Could not include URL list: <name>". The reason is the stat() errno, not the
|
||||||
|
# directory fallback literal (guards against dropping the errno lookup).
|
||||||
|
run "$tmp/miss" "$tmp/nope.txt"
|
||||||
|
loghas 'Could not include URL list "[^"]+": .+'
|
||||||
|
lognot 'Could not include URL list: '
|
||||||
|
lognot 'not a regular file'
|
||||||
|
|
||||||
|
# a directory is rejected with our own reason (locale-independent)
|
||||||
|
mkdir -p "$tmp/adir"
|
||||||
|
run "$tmp/dir" "$tmp/adir"
|
||||||
|
loghas 'Could not include URL list "[^"]+": not a regular file'
|
||||||
|
|
||||||
|
# unreadable regular file: the fopen() errno arm fires, distinct from the
|
||||||
|
# directory branch. Root bypasses mode 000, so skip it there.
|
||||||
|
if test "$(id -u)" -ne 0; then
|
||||||
|
: >"$tmp/noperm.txt"
|
||||||
|
chmod 000 "$tmp/noperm.txt"
|
||||||
|
run "$tmp/perm" "$tmp/noperm.txt"
|
||||||
|
chmod 644 "$tmp/noperm.txt"
|
||||||
|
loghas 'Could not include URL list "[^"]+": .+'
|
||||||
|
lognot 'not a regular file'
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit 0
|
||||||
@@ -4,13 +4,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
|
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
|
||||||
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
# -#test=filter <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||||
|
|
||||||
match() {
|
match() {
|
||||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does match $1" || exit 1
|
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does match $1" || exit 1
|
||||||
}
|
}
|
||||||
nomatch() {
|
nomatch() {
|
||||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# bare star matches everything
|
# bare star matches everything
|
||||||
@@ -71,3 +71,27 @@ nomatch '*[\[\]]' '[' # not matched, despite the docs
|
|||||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||||
nomatch '*[\[\]]' '[]x'
|
nomatch '*[\[\]]' '[]x'
|
||||||
|
|
||||||
|
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
|
||||||
|
# means the size is still unknown (scan time). A size exclusion must stay neutral
|
||||||
|
# then, so the file is fetched and only cancelled once its size is known (#143).
|
||||||
|
fsize() {
|
||||||
|
local want="$1"
|
||||||
|
shift
|
||||||
|
test "$(httrack -O /dev/null -#test=filtersize "$@")" == "$want" || exit 1
|
||||||
|
}
|
||||||
|
fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # scan time: keep
|
||||||
|
fsize 'verdict=forbidden size_flag=1' 5 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # <10KB: cancel
|
||||||
|
fsize 'verdict=allowed size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[<10]' # >=10KB: keep
|
||||||
|
fsize 'verdict=forbidden size_flag=0' -1 foo.txt -* '+*.jpg' '-*.jpg*[<10]' # not a jpg
|
||||||
|
# the '>' operator is just as neutral at scan time, and fires once size is known
|
||||||
|
fsize 'verdict=allowed size_flag=0' -1 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # scan time: keep
|
||||||
|
fsize 'verdict=forbidden size_flag=1' 20 foo.jpg -* '+*.jpg' '-*.jpg*[>10]' # >10KB: cancel
|
||||||
|
|
||||||
|
# [name]/[file]/[path] never span '?' mid-string; a trailing query is still
|
||||||
|
# tolerated by the global '?' rule (same as plain *.aspx), not the class (#144).
|
||||||
|
nomatch '*[path]/end' 'a?b/end'
|
||||||
|
nomatch '*[file]end' 'foo?xend'
|
||||||
|
nomatch '*[name]X' 'abc?X'
|
||||||
|
match '*[file]' 'foo?x=1' # trailing query: tolerated, as for *.aspx
|
||||||
|
match '*.aspx' 'page.aspx?y=2'
|
||||||
|
|||||||
@@ -3,5 +3,7 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# httrack internal hashtable autotest on 100K keys
|
# httrack internal hashtable autotest on 100K keys. Assert the success line (on
|
||||||
httrack -#7 100000
|
# stderr) so a misrouted registry entry can't pass on exit code alone.
|
||||||
|
out=$(httrack -#test=hashtable 100000 2>&1)
|
||||||
|
printf '%s\n' "$out" | grep -q "all hashtable tests were successful!" || exit 1
|
||||||
|
|||||||
@@ -3,13 +3,13 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# IDNA / punycode encode (-#4) and decode (-#5). This code has a CVE history,
|
# IDNA / punycode encode (-#test=idna-encode) and decode (-#test=idna-decode). This code has a CVE history,
|
||||||
# so the edge cases below cover passthrough, round-trips, and malformed input.
|
# so the edge cases below cover passthrough, round-trips, and malformed input.
|
||||||
|
|
||||||
enc() { test "$(httrack -O /dev/null -#4 "$1")" == "$2" || exit 1; }
|
enc() { test "$(httrack -O /dev/null -#test=idna-encode "$1")" == "$2" || exit 1; }
|
||||||
dec() { test "$(httrack -O /dev/null -#5 "$1")" == "$2" || exit 1; }
|
dec() { test "$(httrack -O /dev/null -#test=idna-decode "$1")" == "$2" || exit 1; }
|
||||||
# crash probe: malformed ACE input must exit cleanly, not abort.
|
# crash probe: malformed ACE input must exit cleanly, not abort.
|
||||||
runs() { httrack -O /dev/null -#5 "$1" >/dev/null 2>&1 || exit 1; }
|
runs() { httrack -O /dev/null -#test=idna-decode "$1" >/dev/null 2>&1 || exit 1; }
|
||||||
|
|
||||||
# encode
|
# encode
|
||||||
enc 'www.café.com' 'www.xn--caf-dma.com'
|
enc 'www.café.com' 'www.xn--caf-dma.com'
|
||||||
|
|||||||
@@ -4,13 +4,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# MIME type guessing from extension (get_httptype / give_mimext).
|
# MIME type guessing from extension (get_httptype / give_mimext).
|
||||||
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
# -#test=mime <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||||
|
|
||||||
mime() {
|
mime() {
|
||||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is '$2'" || exit 1
|
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||||
}
|
}
|
||||||
unknown() {
|
unknown() {
|
||||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
mime '/a/b.html' 'text/html'
|
mime '/a/b.html' 'text/html'
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ set -euo pipefail
|
|||||||
# relative path from <curr>'s directory to <link>
|
# relative path from <curr>'s directory to <link>
|
||||||
rel() {
|
rel() {
|
||||||
local got
|
local got
|
||||||
got=$(httrack -O /dev/null -#l "$1" "$2")
|
got=$(httrack -O /dev/null -#test=relative "$1" "$2")
|
||||||
test "$got" == "relative=$3" ||
|
test "$got" == "relative=$3" ||
|
||||||
{
|
{
|
||||||
echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"
|
echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"
|
||||||
@@ -19,7 +19,7 @@ rel() {
|
|||||||
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
|
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
|
||||||
ident() {
|
ident() {
|
||||||
local got
|
local got
|
||||||
got=$(httrack -O /dev/null -#i "$1" "$2" "$3")
|
got=$(httrack -O /dev/null -#test=resolve "$1" "$2" "$3")
|
||||||
test "$got" == "$4" ||
|
test "$got" == "$4" ||
|
||||||
{
|
{
|
||||||
echo "FAIL ident($1, $2, $3): got '$got' want '$4'"
|
echo "FAIL ident($1, $2, $3): got '$got' want '$4'"
|
||||||
|
|||||||
41
tests/01_engine-savename.test
Executable file
41
tests/01_engine-savename.test
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
|
||||||
|
# Asserts on the basename of "savename: <path>".
|
||||||
|
|
||||||
|
name() {
|
||||||
|
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
|
||||||
|
test "${out##*/}" == "$3" || {
|
||||||
|
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# #115: an unknown trailing ".token" is part of the name, keep it and append the type.
|
||||||
|
name '/article-1.884291' 'text/html' 'article-1.884291.html'
|
||||||
|
name '/news/story-12345.987654' 'text/html' 'story-12345.987654.html'
|
||||||
|
|
||||||
|
# Recognized extensions still collapse to the resolved type.
|
||||||
|
name '/page.php' 'text/html' 'page.html'
|
||||||
|
name '/page.asp' 'text/html' 'page.html'
|
||||||
|
name '/foo' 'text/html' 'foo.html'
|
||||||
|
|
||||||
|
# A bare trailing dot is not a tail to keep.
|
||||||
|
name '/page.' 'text/html' 'page.html'
|
||||||
|
|
||||||
|
# Soft-404 (#267/#408): a binary URL served as HTML is named .html.
|
||||||
|
name '/x.pdf' 'text/html' 'x.html'
|
||||||
|
name '/x.gif' 'text/html' 'x.html'
|
||||||
|
|
||||||
|
# Type agrees with the extension: keep it, no churn, no double extension.
|
||||||
|
name '/x.pdf' 'application/pdf' 'x.pdf'
|
||||||
|
name '/x.jpg' 'image/jpeg' 'x.jpg'
|
||||||
|
name '/x.html' 'text/html' 'x.html'
|
||||||
|
name '/x.js' 'application/x-javascript' 'x.js'
|
||||||
|
name '/types/data.json' 'application/json' 'data.json'
|
||||||
|
|
||||||
|
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
|
||||||
|
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||||
17
tests/01_engine-selftest-dispatch.test
Normal file
17
tests/01_engine-selftest-dispatch.test
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# The -#test dispatch itself: a bare -#test lists the registry, and an unknown
|
||||||
|
# name errors (non-zero, diagnostic) instead of silently passing.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# Bare -#test lists known tests (printed to stderr).
|
||||||
|
list=$(httrack -#test 2>&1)
|
||||||
|
printf '%s\n' "$list" | grep -q "filter" || exit 1
|
||||||
|
printf '%s\n' "$list" | grep -q "cache-writefail" || exit 1
|
||||||
|
|
||||||
|
# Unknown name: non-zero exit + diagnostic, and no test result line.
|
||||||
|
rc=0
|
||||||
|
err=$(httrack -#test=bogus 2>&1) || rc=$?
|
||||||
|
test "$rc" -ne 0 || exit 1
|
||||||
|
printf '%s\n' "$err" | grep -q "Unknown self-test" || exit 1
|
||||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
|||||||
|
|
||||||
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
||||||
simp() {
|
simp() {
|
||||||
test "$(httrack -O /dev/null -#1 "$1")" == "simplified=$2" || exit 1
|
test "$(httrack -O /dev/null -#test=simplify "$1")" == "simplified=$2" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
simp './foo/bar/' 'foo/bar/'
|
simp './foo/bar/' 'foo/bar/'
|
||||||
|
|||||||
8
tests/01_engine-stripquery.test
Executable file
8
tests/01_engine-stripquery.test
Executable file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
#

set -euo pipefail

# --strip-query: pattern-scoped query-key stripping for dedup. All assertions
# live in the engine self-test (hts_query_strip_keys + fil_normalized_filtered).
#
# Capture the output before matching: with 'pipefail', piping straight into
# 'grep -q' can fail spuriously, because grep exits at the first match and a
# producer that is still writing then dies with SIGPIPE (status 141).
# A non-zero httrack exit still aborts here through 'set -e'.
out=$(httrack -O /dev/null -#test=stripquery)
grep -q "strip-query self-test OK" <<<"$out"
|
||||||
@@ -3,23 +3,22 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# htssafe.h bounded string operations (driven by 'httrack -#8').
|
# htssafe.h bounded string operations (driven by 'httrack -#test=strsafe').
|
||||||
|
|
||||||
# Success path: every bounded op (strcpybuff/strcatbuff/strncatbuff/strlcpybuff)
|
# Success path: every bounded op (strcpybuff/strcatbuff/strncatbuff/strlcpybuff)
|
||||||
# must behave correctly. Like the other -# debug modes, a trailing token is
|
# must behave correctly. 'run' selects the success path (vs the overflow modes).
|
||||||
# required (a bare '-#8' falls through to the usage screen).
|
|
||||||
rc=0
|
rc=0
|
||||||
out=$(httrack -#8 run) || rc=$?
|
out=$(httrack -#test=strsafe run) || rc=$?
|
||||||
test "$rc" -eq 0 || exit 1
|
test "$rc" -eq 0 || exit 1
|
||||||
test "$out" == "strsafe: OK" || exit 1
|
test "$out" == "strsafe: OK" || exit 1
|
||||||
|
|
||||||
# Overflow path: an over-capacity write into a sized buffer must be caught by
|
# Overflow path: an over-capacity write into a sized buffer must be caught by
|
||||||
# the bounded macro and abort the process, not be silently truncated/completed.
|
# the bounded macro and abort the process, not be silently truncated/completed.
|
||||||
# Assert the htssafe abort signature specifically, so the test cannot pass for
|
# Assert the htssafe abort signature specifically, so the test cannot pass for
|
||||||
# an unrelated reason (e.g. the -#8 mode being gone and falling through to the
|
# an unrelated reason (e.g. the strsafe test being gone, which prints the
|
||||||
# usage screen, which also exits non-zero).
|
# registry to stderr and also exits non-zero).
|
||||||
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
||||||
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
err=$(httrack -#test=strsafe overflow "this string is far too long for the buffer" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*)
|
*"strsafe: NOT aborted"*)
|
||||||
echo "over-capacity write was NOT caught" >&2
|
echo "over-capacity write was NOT caught" >&2
|
||||||
@@ -36,7 +35,7 @@ esac
|
|||||||
# capacity (4 bytes into a 4-byte buffer), so this also pins the boundary: a
|
# capacity (4 bytes into a 4-byte buffer), so this also pins the boundary: a
|
||||||
# '<=' off-by-one in the capacity check would let it through (and print "NOT
|
# '<=' off-by-one in the capacity check would let it through (and print "NOT
|
||||||
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
||||||
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
err=$(httrack -#test=strsafe overflow-buff "abcd" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*)
|
*"strsafe: NOT aborted"*)
|
||||||
echo "htsbuff over-capacity write was NOT caught" >&2
|
echo "htsbuff over-capacity write was NOT caught" >&2
|
||||||
|
|||||||
8
tests/01_engine-urlhack.test
Normal file
8
tests/01_engine-urlhack.test
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
#

set -euo pipefail

# -%u url-hack split (#271): www / // / query-order dedup toggle independently.
# All assertions live in the engine self-test (hash compare flag resolution).
#
# Capture the output before matching: with 'pipefail', piping straight into
# 'grep -q' can fail spuriously, because grep exits at the first match and a
# producer that is still writing then dies with SIGPIPE (status 141).
# A non-zero httrack exit still aborts here through 'set -e'.
out=$(httrack -O /dev/null -#test=urlhack run)
grep -q "urlhack self-test OK" <<<"$out"
|
||||||
110
tests/19_local-connect-fallback.test
Normal file
110
tests/19_local-connect-fallback.test
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# A host that resolves to several addresses must fall back to the next one when
|
||||||
|
# a connect fails, instead of giving up on the first (dead IPv6 on a dual-stack
|
||||||
|
# host, ...). HTTRACK_DEBUG_RESOLVE pins "deadhost" to a refused address first
|
||||||
|
# (127.0.0.2, nothing listening) then the live server (127.0.0.1): the crawl
|
||||||
|
# only succeeds if httrack retries the second address. A second case pins every
|
||||||
|
# address to a refused one, so the slot must exhaust the list and error out
|
||||||
|
# (rather than hang or loop).
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
if test "${V6_SUPPORT:-}" == "no"; then
|
||||||
|
echo "no IPv6 support (resolver list/override is IPv6-only), skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
if ! command -v python3 >/dev/null 2>&1; then
|
||||||
|
echo "python3 missing, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
|
||||||
|
server="$top_srcdir/tests/local-server.py"
|
||||||
|
root="$top_srcdir/tests/server-root"
|
||||||
|
tmpdir=$(mktemp -d)
|
||||||
|
serverpid=
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
if test -n "$serverpid"; then
|
||||||
|
kill "$serverpid" 2>/dev/null || true
|
||||||
|
wait "$serverpid" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
# bind the live server to 127.0.0.1 only, so 127.0.0.2 refuses the connect
|
||||||
|
python3 "$server" --root "$root" --bind 127.0.0.1 >"$tmpdir/srv.out" 2>"$tmpdir/srv.err" &
|
||||||
|
serverpid=$!
|
||||||
|
port=
|
||||||
|
for _ in $(seq 1 50); do
|
||||||
|
line=$(head -n1 "$tmpdir/srv.out" 2>/dev/null || true)
|
||||||
|
if test "${line%% *}" == "PORT"; then
|
||||||
|
port="${line#PORT }"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
kill -0 "$serverpid" 2>/dev/null || {
|
||||||
|
echo "server exited early: $(cat "$tmpdir/srv.err")"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
test -n "$port" || {
|
||||||
|
echo "could not discover server port"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
out="$tmpdir/crawl"
|
||||||
|
HTTRACK_DEBUG_RESOLVE="deadhost:127.0.0.2,127.0.0.1" \
|
||||||
|
httrack "http://deadhost:$port/simple/basic.html" -O "$out" \
|
||||||
|
-c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log" 2>&1
|
||||||
|
|
||||||
|
log="$out/hts-log.txt"
|
||||||
|
|
||||||
|
# the dead address was tried, then the next one (proves the fallback ran)
|
||||||
|
if ! grep -q "trying next address" "$log"; then
|
||||||
|
echo "FAIL: no connect fallback happened"
|
||||||
|
cat "$log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 0 errors and the file was actually fetched (over the live address)
|
||||||
|
errs=$(grep -iEc "^[0-9:]*[[:space:]]Error:" "$log" || true)
|
||||||
|
test "$errs" == "0" || {
|
||||||
|
echo "FAIL: $errs error(s) reported"
|
||||||
|
grep -iE "Error:" "$log"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
test -f "$out/deadhost_$port/simple/basic.html" || {
|
||||||
|
echo "FAIL: basic.html not downloaded via fallback"
|
||||||
|
find "$out" -type f
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# every address refused: the slot exhausts the list, then errors out (the
|
||||||
|
# harness timeout would catch a hang/loop; refused connects are instant)
|
||||||
|
out2="$tmpdir/crawl2"
|
||||||
|
HTTRACK_DEBUG_RESOLVE="alldead:127.0.0.2,127.0.0.3" \
|
||||||
|
httrack "http://alldead:$port/simple/basic.html" -O "$out2" \
|
||||||
|
-c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log2" 2>&1
|
||||||
|
log2="$out2/hts-log.txt"
|
||||||
|
|
||||||
|
grep -q "trying next address" "$log2" || {
|
||||||
|
echo "FAIL: exhaustion path never tried the fallback address"
|
||||||
|
cat "$log2"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -iqE "^[0-9:]*[[:space:]]Error:" "$log2" || {
|
||||||
|
echo "FAIL: all addresses failing did not report an error"
|
||||||
|
cat "$log2"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
test ! -f "$out2/alldead_$port/simple/basic.html" || {
|
||||||
|
echo "FAIL: file downloaded despite every address failing"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "OK: connect fallback succeeds, and exhausting all addresses errors out"
|
||||||
113
tests/20_local-resume-loop.test
Executable file
113
tests/20_local-resume-loop.test
Executable file
@@ -0,0 +1,113 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Issue #206: a continue/update crawl looped forever when the resume Range got a
|
||||||
|
# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
|
||||||
|
set -u
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
testdir=$(cd "$(dirname "$0")" && pwd)
|
||||||
|
server="${testdir}/local-server.py"
|
||||||
|
|
||||||
|
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
|
||||||
|
|
||||||
|
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
|
||||||
|
serverpid=
|
||||||
|
crawlpid=
|
||||||
|
cleanup() {
|
||||||
|
test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
|
||||||
|
if test -n "$serverpid"; then
|
||||||
|
kill "$serverpid" 2>/dev/null
|
||||||
|
wait "$serverpid" 2>/dev/null
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
trap cleanup EXIT HUP INT QUIT PIPE TERM
|
||||||
|
|
||||||
|
# --- start the server, discover its ephemeral port --------------------------
|
||||||
|
# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
|
||||||
|
serverlog="${tmpdir}/server.log"
|
||||||
|
counter="${tmpdir}/blobcount"
|
||||||
|
RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
|
||||||
|
serverpid=$!
|
||||||
|
port=
|
||||||
|
for _ in $(seq 1 50); do
|
||||||
|
line=$(head -n1 "$serverlog" 2>/dev/null)
|
||||||
|
if test "${line%% *}" == "PORT"; then
|
||||||
|
port="${line#PORT }"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
kill -0 "$serverpid" 2>/dev/null || {
|
||||||
|
echo "server exited early: $(cat "$serverlog")"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
test -n "$port" || {
|
||||||
|
echo "could not discover server port"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
base="http://127.0.0.1:${port}"
|
||||||
|
|
||||||
|
which httrack >/dev/null || {
|
||||||
|
echo "could not find httrack"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
out="${tmpdir}/crawl"
|
||||||
|
mkdir "$out"
|
||||||
|
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
|
||||||
|
refdir="${out}/hts-cache/ref"
|
||||||
|
|
||||||
|
# --- pass 1: crawl, interrupt once the blob download is underway -------------
|
||||||
|
printf '[pass 1: interrupt mid-download] ..\t'
|
||||||
|
httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
|
||||||
|
crawlpid=$!
|
||||||
|
# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
|
||||||
|
# finalizes the cache and serializes the temp-ref.
|
||||||
|
for _ in $(seq 1 300); do
|
||||||
|
test -s "$counter" && break
|
||||||
|
kill -0 "$crawlpid" 2>/dev/null || break
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
sleep 0.5
|
||||||
|
kill -TERM "$crawlpid" 2>/dev/null
|
||||||
|
wait "$crawlpid" 2>/dev/null
|
||||||
|
crawlpid=
|
||||||
|
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
|
||||||
|
echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK (temp-ref present)"
|
||||||
|
before=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||||
|
|
||||||
|
# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
|
||||||
|
# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
|
||||||
|
printf '[pass 2: resume must terminate] ..\t'
|
||||||
|
HANG_RC=137 # 128 + SIGKILL
|
||||||
|
httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
|
||||||
|
crawlpid=$!
|
||||||
|
(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
|
||||||
|
guard=$!
|
||||||
|
rc=0
|
||||||
|
wait "$crawlpid" 2>/dev/null || rc=$?
|
||||||
|
crawlpid=
|
||||||
|
kill "$guard" 2>/dev/null || true
|
||||||
|
wait "$guard" 2>/dev/null || true
|
||||||
|
if test "$rc" -eq "$HANG_RC"; then
|
||||||
|
echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK (terminated, rc=$rc)"
|
||||||
|
|
||||||
|
# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
|
||||||
|
# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
|
||||||
|
after=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||||
|
hits=$((after - before))
|
||||||
|
printf '[bounded re-get count] ..\t'
|
||||||
|
if test "$hits" -lt 2; then
|
||||||
|
echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if test "$hits" -gt 8; then
|
||||||
|
echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK (${hits} requests)"
|
||||||
11
tests/21_local-intl-update.test
Normal file
11
tests/21_local-intl-update.test
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# #157: a dotless, accented URL named .html on the first crawl must keep .html
|
||||||
|
# across an update -- not revert to the extensionless name.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||||
|
--found 'intl/Instalação_CVS_no_Ubuntu.html' \
|
||||||
|
--not-found 'intl/Instalação_CVS_no_Ubuntu' \
|
||||||
|
httrack 'BASEURL/intl/index.html'
|
||||||
17
tests/22_local-broken-size.test
Executable file
17
tests/22_local-broken-size.test
Executable file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
#
# Two independent crawls are checked below. Without 'set -e' the script's exit
# status is only that of the last command, so a failure of the first
# (default-mode) crawl would be masked by a passing second crawl.
set -e

: "${top_srcdir:=..}"

# Default: warn, but the file is still written.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
  --found 'size/oversize.bin' \
  --log-found 'bogus state \(broken size' \
  httrack 'BASEURL/size/index.html'

# -%B (tolerant): no warning, file written.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
  --found 'size/oversize.bin' \
  --log-not-found 'bogus state' \
  httrack 'BASEURL/size/index.html' '-%B'
|
||||||
19
tests/23_local-errpage.test
Normal file
19
tests/23_local-errpage.test
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
|
||||||
|
# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
|
||||||
|
# half also does not reproduce; the purge path is not exercised here.)
|
||||||
|
set -e
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||||
|
--found 'errpage/good.html' \
|
||||||
|
--found 'errpage/empty.html' \
|
||||||
|
--not-found 'errpage/missing.html' \
|
||||||
|
httrack 'BASEURL/errpage/index.html' '-o0'
|
||||||
|
|
||||||
|
# Control -o1 (default): the 404 error page is written.
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||||
|
--found 'errpage/missing.html' \
|
||||||
|
httrack 'BASEURL/errpage/index.html' '-o1'
|
||||||
109
tests/24_local-resume-overlap.test
Normal file
109
tests/24_local-resume-overlap.test
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Issue #198: on a resumed download the server may answer the Range with a 206
|
||||||
|
# that starts *before* the offset we asked for (block-aligned ranges). httrack
|
||||||
|
# must honor the returned Content-Range, not blindly append, or the overlap
|
||||||
|
# bytes get duplicated and the file grows (corrupt PDFs). Pass 1 interrupts
|
||||||
|
# flaky.bin mid-body (partial + temp-ref); pass 2 resumes against a 206 that
|
||||||
|
# backs up 8 bytes. The result must equal the same bytes fetched whole (full.bin).
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
testdir=$(cd "$(dirname "$0")" && pwd)
|
||||||
|
server="${testdir}/local-server.py"
|
||||||
|
|
||||||
|
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
|
||||||
|
|
||||||
|
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_198.XXXXXX") || exit 1
|
||||||
|
serverpid=
|
||||||
|
crawlpid=
|
||||||
|
cleanup() {
|
||||||
|
if test -n "$crawlpid"; then kill -9 "$crawlpid" 2>/dev/null || true; fi
|
||||||
|
if test -n "$serverpid"; then
|
||||||
|
kill "$serverpid" 2>/dev/null || true
|
||||||
|
wait "$serverpid" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
trap cleanup EXIT HUP INT QUIT PIPE TERM
|
||||||
|
|
||||||
|
# OVERLAP_COUNTER gets a byte per flaky.bin request so pass 1 knows when to interrupt.
|
||||||
|
serverlog="${tmpdir}/server.log"
|
||||||
|
counter="${tmpdir}/hits"
|
||||||
|
resumed="${tmpdir}/resumed" # gets a byte when the server serves a resume 206
|
||||||
|
OVERLAP_COUNTER="$counter" OVERLAP_RESUMED="$resumed" \
|
||||||
|
python3 "$server" --root "${testdir}/server-root" \
|
||||||
|
>"$serverlog" 2>&1 &
|
||||||
|
serverpid=$!
|
||||||
|
port=
|
||||||
|
for _ in $(seq 1 50); do
|
||||||
|
line=$(head -n1 "$serverlog" 2>/dev/null)
|
||||||
|
if test "${line%% *}" == "PORT"; then
|
||||||
|
port="${line#PORT }"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
kill -0 "$serverpid" 2>/dev/null || {
|
||||||
|
echo "server exited early: $(cat "$serverlog")"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
test -n "$port" || {
|
||||||
|
echo "could not discover server port"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
base="http://127.0.0.1:${port}"
|
||||||
|
|
||||||
|
which httrack >/dev/null || {
|
||||||
|
echo "could not find httrack"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
out="${tmpdir}/crawl"
|
||||||
|
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0 -c1)
|
||||||
|
refdir="${out}/hts-cache/ref"
|
||||||
|
|
||||||
|
# pass 1: interrupt once flaky.bin's prefix is streaming (partial + temp-ref).
|
||||||
|
printf '[pass 1: interrupt flaky.bin] ..\t'
|
||||||
|
httrack "${common[@]}" "${base}/overlap/index.html" >"${tmpdir}/log1" 2>&1 &
|
||||||
|
crawlpid=$!
|
||||||
|
for _ in $(seq 1 300); do
|
||||||
|
test -s "$counter" && break
|
||||||
|
kill -0 "$crawlpid" 2>/dev/null || break
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
sleep 0.5
|
||||||
|
kill -TERM "$crawlpid" 2>/dev/null || true
|
||||||
|
wait "$crawlpid" 2>/dev/null || true
|
||||||
|
crawlpid=
|
||||||
|
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
|
||||||
|
echo "FAIL: no temp-ref survived pass 1; cannot drive the resume"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK (temp-ref present)"
|
||||||
|
|
||||||
|
# pass 2: --continue -> resume Range -> 206 that starts 8 bytes early.
|
||||||
|
printf '[pass 2: resume flaky.bin] ..\t'
|
||||||
|
httrack "${common[@]}" --continue "${base}/overlap/index.html" >"${tmpdir}/log2" 2>&1 || true
|
||||||
|
echo "OK"
|
||||||
|
|
||||||
|
# Guard against a silent full re-download: the byte-compare below only tests the
|
||||||
|
# fix if pass 2 actually went through the resume Range -> 206 path.
|
||||||
|
printf '[resume path was exercised] ..\t'
|
||||||
|
if ! test -s "$resumed"; then
|
||||||
|
echo "FAIL: pass 2 never triggered a resume 206; the overlap fix was not exercised"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK"
|
||||||
|
|
||||||
|
printf '[resumed file is not corrupted] ..\t'
|
||||||
|
dir=$(find "$out" -maxdepth 1 -type d -name '127.0.0.1*' | head -1)
|
||||||
|
flaky="${dir}/overlap/flaky.bin"
|
||||||
|
full="${dir}/overlap/full.bin"
|
||||||
|
if ! test -f "$flaky" || ! test -f "$full"; then
|
||||||
|
echo "FAIL: flaky.bin or full.bin missing after pass 2"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! cmp -s "$flaky" "$full"; then
|
||||||
|
echo "FAIL: resumed flaky.bin ($(wc -c <"$flaky")) != full.bin ($(wc -c <"$full")); overlap duplicated"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK ($(wc -c <"$flaky") bytes, byte-identical)"
|
||||||
16
tests/25_local-mime-exclude.test
Executable file
16
tests/25_local-mime-exclude.test
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# A -mime: exclusion must abort the transfer on the response Content-Type, not
|
||||||
|
# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
|
||||||
|
# the real one: the file is absent either way, but only the fix keeps the count
|
||||||
|
# tiny (header only) instead of pulling the body. Match it positively (a small,
|
||||||
|
# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||||
|
--found 'mimex/real.html' \
|
||||||
|
--not-found 'mimex/blob.pdf' \
|
||||||
|
--log-found 'excluded by MIME type filter' \
|
||||||
|
--log-found '\[[0-9]{1,4} bytes received' \
|
||||||
|
httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'
|
||||||
23
tests/26_local-strip-query.test
Executable file
23
tests/26_local-strip-query.test
Executable file
@@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# End-to-end --strip-query (#112): two links to one resource differing only by
|
||||||
|
# ?utm_source dedup to a single saved file (2 files written: index + resource);
|
||||||
|
# the control crawl without the option keeps both variants (3 files). Locks the
|
||||||
|
# CLI->opt->hash plumbing the engine self-test can't reach.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
# stripped: the two ?utm_source variants collapse to one resource
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
|
||||||
|
httrack 'BASEURL/stripquery/index.html' --strip-query 'utm_source'
|
||||||
|
|
||||||
|
# control: no stripping -> both query-named variants are saved
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
|
||||||
|
httrack 'BASEURL/stripquery/index.html'
|
||||||
|
|
||||||
|
# strip still applies with url-hack off (-%u0): exercises the urlhack-off
|
||||||
|
# savename branch, which must normalize the dedup key the same way the hash does
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 2 \
|
||||||
|
httrack 'BASEURL/stripquery/index.html' -%u0 --strip-query 'utm_source'
|
||||||
@@ -5,6 +5,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
|||||||
proxy-https-server.py \
|
proxy-https-server.py \
|
||||||
local-crawl.sh local-server.py server.crt server.key \
|
local-crawl.sh local-server.py server.crt server.key \
|
||||||
server-root/simple/basic.html server-root/simple/link.html \
|
server-root/simple/basic.html server-root/simple/link.html \
|
||||||
|
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||||
fixtures/cache-golden/hts-cache/new.zip
|
fixtures/cache-golden/hts-cache/new.zip
|
||||||
|
|
||||||
TESTS_ENVIRONMENT =
|
TESTS_ENVIRONMENT =
|
||||||
@@ -13,6 +14,7 @@ TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
|
|||||||
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
||||||
TESTS_ENVIRONMENT += ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS)
|
TESTS_ENVIRONMENT += ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS)
|
||||||
TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
||||||
|
TESTS_ENVIRONMENT += V6_SUPPORT=$(V6_SUPPORT)
|
||||||
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
||||||
|
|
||||||
TEST_EXTENSIONS = .test
|
TEST_EXTENSIONS = .test
|
||||||
@@ -25,6 +27,7 @@ TESTS = \
|
|||||||
00_runnable.test \
|
00_runnable.test \
|
||||||
01_engine-cache.test \
|
01_engine-cache.test \
|
||||||
01_engine-cache-golden.test \
|
01_engine-cache-golden.test \
|
||||||
|
01_engine-cache-writefail.test \
|
||||||
01_engine-charset.test \
|
01_engine-charset.test \
|
||||||
01_engine-cmdline.test \
|
01_engine-cmdline.test \
|
||||||
01_engine-cookies.test \
|
01_engine-cookies.test \
|
||||||
@@ -32,6 +35,7 @@ TESTS = \
|
|||||||
01_engine-dns.test \
|
01_engine-dns.test \
|
||||||
01_engine-doitlog.test \
|
01_engine-doitlog.test \
|
||||||
01_engine-entities.test \
|
01_engine-entities.test \
|
||||||
|
01_engine-filelist.test \
|
||||||
01_engine-filter.test \
|
01_engine-filter.test \
|
||||||
01_engine-hashtable.test \
|
01_engine-hashtable.test \
|
||||||
01_engine-idna.test \
|
01_engine-idna.test \
|
||||||
@@ -39,8 +43,12 @@ TESTS = \
|
|||||||
01_engine-parse.test \
|
01_engine-parse.test \
|
||||||
01_engine-rcfile.test \
|
01_engine-rcfile.test \
|
||||||
01_engine-relative.test \
|
01_engine-relative.test \
|
||||||
|
01_engine-savename.test \
|
||||||
|
01_engine-selftest-dispatch.test \
|
||||||
01_engine-simplify.test \
|
01_engine-simplify.test \
|
||||||
|
01_engine-stripquery.test \
|
||||||
01_engine-strsafe.test \
|
01_engine-strsafe.test \
|
||||||
|
01_engine-urlhack.test \
|
||||||
02_manpage-regen.test \
|
02_manpage-regen.test \
|
||||||
02_update-cache.test \
|
02_update-cache.test \
|
||||||
10_crawl-simple.test \
|
10_crawl-simple.test \
|
||||||
@@ -56,6 +64,14 @@ TESTS = \
|
|||||||
15_local-types.test \
|
15_local-types.test \
|
||||||
16_local-assume.test \
|
16_local-assume.test \
|
||||||
17_local-empty-ct.test \
|
17_local-empty-ct.test \
|
||||||
18_local-update.test
|
18_local-update.test \
|
||||||
|
19_local-connect-fallback.test \
|
||||||
|
20_local-resume-loop.test \
|
||||||
|
21_local-intl-update.test \
|
||||||
|
22_local-broken-size.test \
|
||||||
|
23_local-errpage.test \
|
||||||
|
24_local-resume-overlap.test \
|
||||||
|
25_local-mime-exclude.test \
|
||||||
|
26_local-strip-query.test
|
||||||
|
|
||||||
CLEANFILES = check-network_sh.cache
|
CLEANFILES = check-network_sh.cache
|
||||||
|
|||||||
@@ -14,7 +14,9 @@
|
|||||||
# Usage:
|
# Usage:
|
||||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||||
|
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||||
# httrack BASEURL/some/path [httrack-args...]
|
# httrack BASEURL/some/path [httrack-args...]
|
||||||
|
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||||
|
|
||||||
set -u
|
set -u
|
||||||
|
|
||||||
@@ -107,7 +109,7 @@ while test "$pos" -lt "$nargs"; do
|
|||||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||||
pos=$((pos + 1))
|
pos=$((pos + 1))
|
||||||
;;
|
;;
|
||||||
--found | --not-found | --directory)
|
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||||
pos=$((pos + 1))
|
pos=$((pos + 1))
|
||||||
;;
|
;;
|
||||||
@@ -196,6 +198,15 @@ if test -n "$rerun"; then
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
result "OK (update)"
|
result "OK (update)"
|
||||||
|
# The update summary reports "files updated"; a fresh crawl never does. Assert
|
||||||
|
# it so a regression that bypasses the cache (re-crawls fresh) can't pass.
|
||||||
|
info "checking update used the cache"
|
||||||
|
if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
|
||||||
|
result "OK"
|
||||||
|
else
|
||||||
|
result "update pass did not report cache activity"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||||
@@ -248,6 +259,22 @@ while test "$i" -lt "${#audit[@]}"; do
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
--log-found)
|
||||||
|
i=$((i + 1))
|
||||||
|
info "checking log matches ${audit[$i]}"
|
||||||
|
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
|
||||||
|
result "not in log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
--log-not-found)
|
||||||
|
i=$((i + 1))
|
||||||
|
info "checking log lacks ${audit[$i]}"
|
||||||
|
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
|
||||||
|
result "present in log"
|
||||||
|
exit 1
|
||||||
|
else result "OK"; fi
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
i=$((i + 1))
|
i=$((i + 1))
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||||
from urllib.parse import quote, unquote, urlsplit
|
from urllib.parse import quote, unquote, urlsplit
|
||||||
|
|
||||||
@@ -176,6 +177,170 @@ class Handler(SimpleHTTPRequestHandler):
|
|||||||
body, ctype = self.TYPE_MATRIX[path]
|
body, ctype = self.TYPE_MATRIX[path]
|
||||||
self.send_raw(body, ctype)
|
self.send_raw(body, ctype)
|
||||||
|
|
||||||
|
# --- MIME-type exclusion abort (issue #58) -----------------------------
|
||||||
|
# A -mime:application/pdf filter must abort the transfer once the header
|
||||||
|
# arrives, not download the whole body and discard it.
|
||||||
|
def route_mimex_index(self):
|
||||||
|
self.send_html(
|
||||||
|
'\t<a href="blob.pdf">pdf</a>\n' '\t<a href="real.html">real</a>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
# 1 MB body: the fix aborts after the header, so httrack's "bytes received"
|
||||||
|
# stays tiny; without it the engine reads the body and the count jumps.
|
||||||
|
MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
|
||||||
|
|
||||||
|
def route_mimex_blob(self):
|
||||||
|
self.send_raw(self.MIMEX_BLOB, "application/pdf")
|
||||||
|
|
||||||
|
def route_mimex_real(self):
|
||||||
|
self.send_raw(b"<html><body>real</body></html>", "text/html")
|
||||||
|
|
||||||
|
# --- special chars in URLs across an update (issue #157) ---------------
|
||||||
|
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||||
|
# name the first crawl picks (.html) must survive the update pass.
|
||||||
|
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||||
|
|
||||||
|
def route_intl_index(self):
|
||||||
|
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||||
|
|
||||||
|
def route_intl_page(self):
|
||||||
|
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||||
|
|
||||||
|
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||||
|
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||||
|
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||||
|
RESUME_LEN = len(RESUME_PREFIX) + 4096 # declared length never delivered
|
||||||
|
_resume_started = False
|
||||||
|
|
||||||
|
def route_resume_index(self):
|
||||||
|
self.send_html('\t<a href="blob.txt">blob</a>')
|
||||||
|
|
||||||
|
def route_resume(self):
|
||||||
|
counter = os.environ.get("RESUME_COUNTER")
|
||||||
|
if counter:
|
||||||
|
with open(counter, "a") as fp:
|
||||||
|
fp.write("x")
|
||||||
|
# First GET: stall mid-body so the crawl can be interrupted with a partial.
|
||||||
|
if not Handler._resume_started:
|
||||||
|
Handler._resume_started = True
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "image/png")
|
||||||
|
self.send_header("Content-Length", str(self.RESUME_LEN))
|
||||||
|
self.send_header("Accept-Ranges", "bytes")
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(self.RESUME_PREFIX)
|
||||||
|
self.wfile.flush()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
time.sleep(3600)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
self.send_response(416, "Requested Range Not Satisfiable")
|
||||||
|
self.send_header("Content-Type", "image/png")
|
||||||
|
self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
|
||||||
|
self.send_header("Content-Length", "0")
|
||||||
|
self.end_headers()
|
||||||
|
|
||||||
|
# 206 resume must honor the server's Content-Range, not the offset we asked
|
||||||
|
# for (#198): a server resuming a few bytes *before* the request must not
|
||||||
|
# leave httrack duplicating the overlap onto the partial. flaky.bin
|
||||||
|
# interrupts once then resumes OVERLAP_EARLY bytes early; full.bin serves
|
||||||
|
# the identical bytes in one shot, so the test can compare the two.
|
||||||
|
OVERLAP_BLOB = b"%PDF-1.4\n" + bytes((i * 37 + 11) % 256 for i in range(8000))
|
||||||
|
OVERLAP_EARLY = 8
|
||||||
|
OVERLAP_PREFIX_LEN = 4000 # flushed before the stall
|
||||||
|
_overlap_started = False
|
||||||
|
|
||||||
|
def route_overlap_index(self):
|
||||||
|
self.send_html('\t<a href="flaky.bin">flaky</a>\n\t<a href="full.bin">full</a>')
|
||||||
|
|
||||||
|
def route_overlap_full(self):
|
||||||
|
self.send_raw(self.OVERLAP_BLOB, "application/octet-stream")
|
||||||
|
|
||||||
|
def route_overlap(self):
|
||||||
|
counter = os.environ.get("OVERLAP_COUNTER")
|
||||||
|
if counter:
|
||||||
|
with open(counter, "a") as fp:
|
||||||
|
fp.write("x")
|
||||||
|
blob = self.OVERLAP_BLOB
|
||||||
|
rng = self.headers.get("Range")
|
||||||
|
# First GET: stream a prefix then stall, so the crawl can be interrupted
|
||||||
|
# mid-body (partial + temp-ref on disk).
|
||||||
|
if rng is None and not Handler._overlap_started:
|
||||||
|
Handler._overlap_started = True
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "application/octet-stream")
|
||||||
|
self.send_header("Content-Length", str(len(blob)))
|
||||||
|
self.send_header("Accept-Ranges", "bytes")
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(blob[: self.OVERLAP_PREFIX_LEN])
|
||||||
|
self.wfile.flush()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
time.sleep(3600)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
if rng is None: # no resume request: serve the whole file
|
||||||
|
return self.route_overlap_full()
|
||||||
|
# Resume: honor the Range, but back up OVERLAP_EARLY bytes.
|
||||||
|
start = (
|
||||||
|
int(rng[len("bytes=") :].split("-")[0]) if rng.startswith("bytes=") else 0
|
||||||
|
)
|
||||||
|
start = max(0, start - self.OVERLAP_EARLY)
|
||||||
|
# Signal that the resume Range -> 206 path actually fired, so the test
|
||||||
|
# can prove it was exercised (not a silent full re-download).
|
||||||
|
resumed = os.environ.get("OVERLAP_RESUMED")
|
||||||
|
if resumed:
|
||||||
|
with open(resumed, "a") as fp:
|
||||||
|
fp.write("x")
|
||||||
|
part = blob[start:]
|
||||||
|
self.send_response(206, "Partial Content")
|
||||||
|
self.send_header("Content-Type", "application/octet-stream")
|
||||||
|
self.send_header("Content-Length", str(len(part)))
|
||||||
|
self.send_header(
|
||||||
|
"Content-Range", "bytes %d-%d/%d" % (start, len(blob) - 1, len(blob))
|
||||||
|
)
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(part)
|
||||||
|
|
||||||
|
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||||
|
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||||
|
def route_errpage_index(self):
|
||||||
|
self.send_html(
|
||||||
|
'\t<a href="good.html">good</a>\n'
|
||||||
|
'\t<a href="missing.html">missing</a>\n'
|
||||||
|
'\t<a href="empty.html">empty</a>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def route_errpage_good(self):
|
||||||
|
self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
|
||||||
|
|
||||||
|
def route_errpage_missing(self):
|
||||||
|
self.send_html("\t404 error body", status=404, extra_status="Not Found")
|
||||||
|
|
||||||
|
def route_errpage_empty(self):
|
||||||
|
self.send_raw(b"", "text/html")
|
||||||
|
|
||||||
|
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||||
|
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||||
|
def route_size_index(self):
|
||||||
|
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||||
|
|
||||||
|
def route_size_oversize(self):
|
||||||
|
body = b"A" * 100
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "application/octet-stream")
|
||||||
|
self.send_header("Content-Length", str(len(body) - 2)) # lie: too short
|
||||||
|
self.send_header("Connection", "close")
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(body)
|
||||||
|
|
||||||
ROUTES = {
|
ROUTES = {
|
||||||
"/cookies/entrance.php": route_entrance,
|
"/cookies/entrance.php": route_entrance,
|
||||||
"/cookies/second.php": route_second,
|
"/cookies/second.php": route_second,
|
||||||
@@ -195,6 +360,22 @@ class Handler(SimpleHTTPRequestHandler):
|
|||||||
"/types/style.css": route_types,
|
"/types/style.css": route_types,
|
||||||
"/types/data.json": route_types,
|
"/types/data.json": route_types,
|
||||||
"/types/gen.php": route_types,
|
"/types/gen.php": route_types,
|
||||||
|
"/intl/index.html": route_intl_index,
|
||||||
|
"/intl/" + INTL_NAME: route_intl_page,
|
||||||
|
"/resume/index.html": route_resume_index,
|
||||||
|
"/resume/blob.txt": route_resume,
|
||||||
|
"/overlap/index.html": route_overlap_index,
|
||||||
|
"/overlap/flaky.bin": route_overlap,
|
||||||
|
"/overlap/full.bin": route_overlap_full,
|
||||||
|
"/size/index.html": route_size_index,
|
||||||
|
"/size/oversize.bin": route_size_oversize,
|
||||||
|
"/errpage/index.html": route_errpage_index,
|
||||||
|
"/errpage/good.html": route_errpage_good,
|
||||||
|
"/errpage/missing.html": route_errpage_missing,
|
||||||
|
"/errpage/empty.html": route_errpage_empty,
|
||||||
|
"/mimex/index.html": route_mimex_index,
|
||||||
|
"/mimex/blob.pdf": route_mimex_blob,
|
||||||
|
"/mimex/real.html": route_mimex_real,
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- dispatch ----------------------------------------------------------
|
# --- dispatch ----------------------------------------------------------
|
||||||
@@ -202,7 +383,8 @@ class Handler(SimpleHTTPRequestHandler):
|
|||||||
def dispatch(self):
|
def dispatch(self):
|
||||||
self._set_cookies = []
|
self._set_cookies = []
|
||||||
path = urlsplit(self.path).path
|
path = urlsplit(self.path).path
|
||||||
handler = self.ROUTES.get(path)
|
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||||
|
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||||
if handler is not None:
|
if handler is not None:
|
||||||
handler(self)
|
handler(self)
|
||||||
return True
|
return True
|
||||||
|
|||||||
1
tests/server-root/stripquery/a.html
Normal file
1
tests/server-root/stripquery/a.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<html><body>resource A</body></html>
|
||||||
5
tests/server-root/stripquery/index.html
Normal file
5
tests/server-root/stripquery/index.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<html><body>
|
||||||
|
Two links to one resource, differing only by a tracking parameter.
|
||||||
|
<a href="a.html?utm_source=x">x</a>
|
||||||
|
<a href="a.html?utm_source=y">y</a>
|
||||||
|
</body></html>
|
||||||
Reference in New Issue
Block a user