mirror of
https://github.com/xroche/httrack.git
synced 2026-06-26 20:17:05 +03:00
Compare commits
24 Commits
feature/lo
...
selftest-n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a21ec9acf | ||
|
|
756d8fb8bd | ||
|
|
5501faa7b1 | ||
|
|
6322b6fb1f | ||
|
|
58f368a91a | ||
|
|
c97b3e233e | ||
|
|
b615a4e7fd | ||
|
|
594cf0da39 | ||
|
|
3845cd1fb3 | ||
|
|
94bffb0804 | ||
|
|
a5c86e7e89 | ||
|
|
54f5717057 | ||
|
|
40fc9de360 | ||
|
|
4614eefefe | ||
|
|
b0e8262db0 | ||
|
|
addbd3136b | ||
|
|
a64c4cd160 | ||
|
|
1611dbcabf | ||
|
|
099501ee50 | ||
|
|
1b9eefa3b4 | ||
|
|
9c8d3a41eb | ||
|
|
ae77cd9d6d | ||
|
|
51b8dcd81c | ||
|
|
bcce664143 |
35
.github/workflows/ci.yml
vendored
35
.github/workflows/ci.yml
vendored
@@ -227,34 +227,47 @@ jobs:
|
||||
# Validate the Debian packaging via the same script maintainers release with.
|
||||
# One amd64/gcc run is enough: packaging (control/rules/manifest/lintian/quilt
|
||||
# source build) is arch- and compiler-independent, and the build matrix above
|
||||
# already covers compile portability. lintian runs with --fail-on=error.
|
||||
# already covers compile portability. mkdeb.sh runs lintian as an explicit gate
|
||||
# (debuild does not propagate lintian's exit) with --fail-on=error,warning.
|
||||
deb:
|
||||
name: deb package (lintian)
|
||||
runs-on: ubuntu-24.04
|
||||
# Build and gate inside Debian sid, the upload target. A Debian dpkg-deb
|
||||
# produces archive-legal xz members (an Ubuntu host defaults to zstd, which
|
||||
# the archive's lintian rejects), and sid's lintian carries the same
|
||||
# data-driven checks (embedded-lib fingerprints and the like) the buildds and
|
||||
# UDD apply -- so issues surface here instead of after upload.
|
||||
container: debian:sid
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install packaging toolchain
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
apt-get update
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates git \
|
||||
build-essential autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev libssl-dev \
|
||||
debhelper devscripts lintian fakeroot
|
||||
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
# --unsigned: CI has no GPG key (also skips the release sig/checksums).
|
||||
# debuild builds every package, then lintian gates on errors.
|
||||
# mkdeb builds every package then runs the lintian gate (--fail-on=error,
|
||||
# warning); debuild runs the packaged test pass.
|
||||
#
|
||||
# DEB_BUILD_OPTIONS trims work CI does not need (release builds via
|
||||
# mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
|
||||
# LTO payloads are slow to compress and that CI never ships; parallel uses
|
||||
# every core. We let debuild run its test pass -- the only one now that
|
||||
# mkdeb no longer runs its own -- so CI exercises the packaged tests.
|
||||
- name: Build Debian packages
|
||||
# every core.
|
||||
- name: Build and lint Debian packages
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# The workspace volume is owned by the host runner uid, but the
|
||||
# container runs as root, so mkdeb's git calls (superproject and the
|
||||
# coucal submodule) trip "dubious ownership"; mark them all safe.
|
||||
git config --global --add safe.directory "*"
|
||||
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
||||
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
||||
|
||||
|
||||
@@ -33,8 +33,9 @@ the operational checklist: toolchain, invariants, and how to ship a change.
|
||||
- Be terse. Comment the why, in English; translate French comments you touch.
|
||||
- Strip AI tells from prose (em-dash overuse, rule-of-three, filler, vague
|
||||
attributions). Ref: Wikipedia "Signs of AI writing". Claude Code: `/humanizer`.
|
||||
- Behavior change → add a test. Fast path: a hidden `httrack -#N` debug
|
||||
subcommand (`htscoremain.c`) driven by a `tests/NN_*.test`, over a slow crawl.
|
||||
- Behavior change → add a test. Fast path: a hidden `httrack -#test=NAME` engine
|
||||
self-test (registry in `htsselftest.c`; `-#test` lists them) driven by a
|
||||
`tests/NN_*.test`, over a slow crawl.
|
||||
|
||||
## Review your change adversarially (strongly suggested)
|
||||
Before pushing, and when reviewing others, don't skim for bugs:
|
||||
|
||||
12
configure.ac
12
configure.ac
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.8], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.9], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,9 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:0:0: htsblk layout changed (contenttype/charset/contentencoding widened to
|
||||
# 128), an incompatible ABI break, so bump current and reset revision/age.
|
||||
VERSION_INFO="3:0:0"
|
||||
# 3:1:0: 3.49.9 changed code but not the exported interface vs 3.49.8 (same 164
|
||||
# symbols, no struct-layout change), so bump revision only. (3:0:0 was the htsblk
|
||||
# mime-buffer widening, an ABI break that moved the soname .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:1:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
@@ -214,9 +215,12 @@ AC_SUBST(OPENSSL_LIBS)
|
||||
fi
|
||||
|
||||
### Support IPv6
|
||||
V6_SUPPORT=no
|
||||
AC_CHECK_LIB(c, getaddrinfo, [V6_FLAG="-DINET6"
|
||||
V6_SUPPORT=yes
|
||||
AC_DEFINE(HTS_INET6, 1, [Check for IPv6])], AC_MSG_WARN([*** IPv6 not found IPv6 compatibility disabled]))
|
||||
AC_SUBST(V6_FLAG)
|
||||
AC_SUBST(V6_SUPPORT)
|
||||
|
||||
### Check for LFS
|
||||
AC_CHECK_LIB(c, fopen64, [LFS_FLAG="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE"
|
||||
|
||||
11
debian/changelog
vendored
11
debian/changelog
vendored
@@ -1,3 +1,14 @@
|
||||
httrack (3.49.9-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: Content-Type and file-type detection fixes (trust a
|
||||
declared Content-Type over a binary URL extension, honor --assume under the
|
||||
delayed type check, keep a known extension against a bogus or empty
|
||||
Content-Type, and avoid an uninitialised read on an empty Content-Type), and
|
||||
restored C++ source-compatibility of the installed headers so reverse
|
||||
dependencies (httraqt) build again.
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 21 Jun 2026 17:59:38 +0200
|
||||
|
||||
httrack (3.49.8-2) unstable; urgency=medium
|
||||
|
||||
* Rename libhttrack2 to libhttrack3 to follow the SONAME, which the 3.49.8
|
||||
|
||||
118
debian/copyright
vendored
118
debian/copyright
vendored
@@ -1,21 +1,109 @@
|
||||
This package was debianized by Xavier Roche <roche@httrack.com> on
|
||||
Fri, 27 Sep 2002 16:42:26 +0200
|
||||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: httrack
|
||||
Upstream-Contact: Xavier Roche <roche@httrack.com>
|
||||
Source: https://www.httrack.com/
|
||||
|
||||
The current Debian maintainer is Xavier Roche <xavier@debian.org>
|
||||
Files: *
|
||||
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||
License: GPL-3+
|
||||
Comment:
|
||||
The engine includes contributions from Yann Philippot (src/htsjava.c,
|
||||
src/htsjava.h). htsbasenet.h links against the system OpenSSL library
|
||||
(originally by Eric Young); no OpenSSL/SSLeay code is bundled here.
|
||||
|
||||
Upstream author: Xavier Roche <roche@httrack.com>
|
||||
Files: src/minizip/*
|
||||
Copyright: 1998-2010 Gilles Vollant
|
||||
2007-2008 Even Rouault
|
||||
2009-2010 Mathias Svensson
|
||||
1990-2000 Info-ZIP
|
||||
License: Zlib
|
||||
Comment:
|
||||
The decryption code in src/minizip/crypt.h and src/minizip/unzip.c derives
|
||||
from the Info-ZIP distribution, distributed under the same terms.
|
||||
|
||||
Copyright: 1998-2014 Xavier Roche and other contributors
|
||||
Files: src/md5.c
|
||||
Copyright: 1993 Colin Plumb
|
||||
License: public-domain-md5
|
||||
This code implements the MD5 message-digest algorithm, due to Ron Rivest.
|
||||
It was written by Colin Plumb in 1993, no copyright is claimed. This code
|
||||
is in the public domain; do with it what you wish.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
Files: src/coucal/*
|
||||
Copyright: 2013-2014 Xavier Roche
|
||||
License: BSD-3-clause
|
||||
|
||||
On Debian systems, the complete text of the GNU General Public
|
||||
License version 3 can be found in /usr/share/common-licenses/GPL-3 file.
|
||||
Files: src/coucal/murmurhash3.h*
|
||||
Copyright: Austin Appleby
|
||||
License: public-domain-murmurhash3
|
||||
MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||
domain. The author hereby disclaims copyright to this source code.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Files: html/server/div/com.httrack.WebHTTrack.metainfo.xml
|
||||
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||
License: FSFAP
|
||||
Copying and distribution of this file, with or without modification, are
|
||||
permitted in any medium without royalty provided the copyright notice and
|
||||
this notice are preserved. This file is offered as-is, without any warranty.
|
||||
|
||||
Files: debian/*
|
||||
Copyright: 2002-2026 Xavier Roche <xavier@debian.org>
|
||||
License: GPL-3+
|
||||
|
||||
License: GPL-3+
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
.
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
.
|
||||
On Debian systems, the complete text of the GNU General Public License
|
||||
version 3 can be found in /usr/share/common-licenses/GPL-3.
|
||||
|
||||
License: Zlib
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the
|
||||
use of this software.
|
||||
.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
.
|
||||
1. The origin of this software must not be misrepresented; you must not claim
|
||||
that you wrote the original software. If you use this software in a product,
|
||||
an acknowledgment in the product documentation would be appreciated but is
|
||||
not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
License: BSD-3-clause
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
.
|
||||
1. Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. Neither the name of the copyright holder nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
3
debian/httrack-doc.lintian-overrides
vendored
3
debian/httrack-doc.lintian-overrides
vendored
@@ -4,3 +4,6 @@
|
||||
# so the path lives in the display pointer, not the override -- match with '*'.
|
||||
httrack-doc: extra-license-file *
|
||||
httrack-doc: package-contains-documentation-outside-usr-share-doc *
|
||||
# search.sh is a sample CGI shipped alongside the HTML manual, not meant to be
|
||||
# run from the package tree; it stays non-executable by design.
|
||||
httrack-doc: script-not-executable *
|
||||
|
||||
5
debian/libhttrack3.lintian-overrides
vendored
5
debian/libhttrack3.lintian-overrides
vendored
@@ -1,3 +1,8 @@
|
||||
# The shared libraries ship without a versioned symbols control file (ABI is
|
||||
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
||||
libhttrack3: no-symbols-control-file usr/lib/*
|
||||
|
||||
# Bundled, locally patched minizip (src/minizip): it adds a zipFlush() API the
|
||||
# system libminizip lacks (htscache.c flushes the cache .zip so an interrupted
|
||||
# crawl leaves a valid archive), plus Android/old-zlib portability fixes.
|
||||
libhttrack3: embedded-library *libminizip*
|
||||
|
||||
3
debian/proxytrack.lintian-overrides
vendored
Normal file
3
debian/proxytrack.lintian-overrides
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Statically linked against httrack's bundled, patched minizip (see src/minizip
|
||||
# and libhttrack3's override): the zipFlush() API is absent from the system one.
|
||||
proxytrack: embedded-library *libminizip*
|
||||
@@ -4,6 +4,12 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-9
|
||||
+ Fixed: file-type detection from the Content-Type header: trust a declared type over a binary URL extension, honor --assume under the delayed type check, and keep a known extension against a bogus or empty Content-Type (#267, #29, #56)
|
||||
+ Fixed: an uninitialized-buffer read when the Content-Type is empty (#411)
|
||||
+ Fixed: restored C++ source-compatibility of the installed headers so reverse dependencies (httraqt) build again (#413)
|
||||
+ Changed: multiple internal build, packaging and test-harness improvements
|
||||
|
||||
3.49-8
|
||||
+ New: tunnel HTTPS downloads through the configured HTTP proxy via CONNECT (#85)
|
||||
+ New: parse every candidate URL in <img> and <source> srcset lists (#326)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
.\"
|
||||
.\" This file is generated by man/makeman.sh; do not edit by hand.
|
||||
.\" SPDX-License-Identifier: GPL-3.0-or-later
|
||||
.TH httrack 1 "13 June 2026" "httrack website copier"
|
||||
.TH httrack 1 "26 June 2026" "httrack website copier"
|
||||
.SH NAME
|
||||
httrack \- offline browser : copy websites to a local directory
|
||||
.SH SYNOPSIS
|
||||
@@ -313,12 +313,8 @@ debug HTTP headers in logfile (\-\-debug\-headers)
|
||||
.SS Guru options: (do NOT use if possible)
|
||||
.IP \-#X
|
||||
*use optimized engine (limited memory boundary checks) (\-\-fast\-engine)
|
||||
.IP \-#0
|
||||
filter test (\-#0 '*.gif' 'www.bar.com/foo.gif') (\-\-debug\-testfilters <param>)
|
||||
.IP \-#1
|
||||
simplify test (\-#1 ./foo/bar/../foobar)
|
||||
.IP \-#2
|
||||
type test (\-#2 /foo/bar.php)
|
||||
.IP \-#test
|
||||
list engine self\-tests (run one with \-#test=NAME [args])
|
||||
.IP \-#C
|
||||
cache list (\-#C '*.com/spider*.gif') (\-\-debug\-cache <param>)
|
||||
.IP \-#R
|
||||
|
||||
@@ -56,7 +56,7 @@ whttrackrundir = $(bindir)
|
||||
whttrackrun_SCRIPTS = webhttrack
|
||||
|
||||
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htscache_selftest.c \
|
||||
htscache_selftest.c htsdns_selftest.c htsselftest.c \
|
||||
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
||||
htshelp.c htslib.c htscoremain.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
@@ -66,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htscatchurl.h \
|
||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htsselftest.h htscatchurl.h \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
|
||||
153
src/htsback.c
153
src/htsback.c
@@ -73,6 +73,8 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
||||
|
||||
sback->count = back_max;
|
||||
sback->lnk = (lien_back *) calloct((back_max + 1), sizeof(lien_back));
|
||||
sback->connect_fallback = (hts_connect_fallback *) calloct(
|
||||
(back_max + 1), sizeof(hts_connect_fallback));
|
||||
sback->ready = coucal_new(0);
|
||||
hts_set_hash_handler(sback->ready, opt);
|
||||
coucal_set_name(sback->ready, "back_new");
|
||||
@@ -83,6 +85,7 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
||||
sback->lnk[i].r.location = sback->lnk[i].location_buffer;
|
||||
sback->lnk[i].status = STATUS_FREE;
|
||||
sback->lnk[i].r.soc = INVALID_SOCKET;
|
||||
sback->connect_fallback[i].addr_count = -1; // not yet probed
|
||||
}
|
||||
return sback;
|
||||
}
|
||||
@@ -93,6 +96,7 @@ void back_free(struct_back ** sback) {
|
||||
freet((*sback)->lnk);
|
||||
(*sback)->lnk = NULL;
|
||||
}
|
||||
freet((*sback)->connect_fallback);
|
||||
if ((*sback)->ready != NULL) {
|
||||
coucal_delete(&(*sback)->ready);
|
||||
(*sback)->ready_size_bytes = 0;
|
||||
@@ -102,6 +106,72 @@ void back_free(struct_back ** sback) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Per-candidate connect deadline cap (seconds): a connecting slot with another
|
||||
address to try waits at most this long before falling back, instead of the
|
||||
full (default 120s) slot timeout. Caps the dead-IPv6 stall while staying well
|
||||
above a normal handshake. The last candidate still gets the full timeout. */
|
||||
#define HTS_CONNECT_FALLBACK_TIMEOUT 10
|
||||
|
||||
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
|
||||
int timeout) {
|
||||
int deadline;
|
||||
|
||||
if (addr_index + 1 >= addr_count) // last (or only) candidate: no fallback
|
||||
return 0;
|
||||
if (timeout <= 0) // no timeout management: never force it
|
||||
return 0;
|
||||
deadline = (timeout < HTS_CONNECT_FALLBACK_TIMEOUT)
|
||||
? timeout
|
||||
: HTS_CONNECT_FALLBACK_TIMEOUT;
|
||||
return elapsed >= deadline;
|
||||
}
|
||||
|
||||
/* Pending-connect result for a non-blocking socket reported ready by select():
|
||||
0 = connected, >0 = the connect errno (refused, unreachable, ...), -1 if the
|
||||
probe itself failed. A failed connect is reported writable too, so this is
|
||||
how success is told from failure without blocking. */
|
||||
static int connect_socket_error(T_SOC soc) {
|
||||
int soerr = 0;
|
||||
socklen_t len = (socklen_t) sizeof(soerr);
|
||||
|
||||
if (getsockopt(soc, SOL_SOCKET, SO_ERROR, (char *) &soerr, &len) != 0)
|
||||
return -1;
|
||||
return soerr;
|
||||
}
|
||||
|
||||
/* Retry a stuck/failed connecting slot against its next resolved address.
|
||||
Closes the current socket and starts a non-blocking connect to the next
|
||||
candidate, leaving the slot in STATUS_CONNECTING. Returns 1 if a new connect
|
||||
was started, 0 if no fallback address remains (caller fails the slot). */
|
||||
static int back_connect_next(httrackp *opt, struct_back *sback, int i) {
|
||||
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||
lien_back *const back = sback->lnk;
|
||||
const int next = cf->addr_index + 1;
|
||||
T_SOC soc;
|
||||
|
||||
if (next >= cf->addr_count)
|
||||
return 0;
|
||||
|
||||
if (back[i].r.soc != INVALID_SOCKET) {
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
}
|
||||
soc = newhttp_addr(opt, back[i].url_adr, &back[i].r, -1, 0, next, NULL);
|
||||
if (soc == INVALID_SOCKET)
|
||||
return 0;
|
||||
|
||||
back[i].r.soc = soc;
|
||||
cf->addr_index = next;
|
||||
cf->connect_start = time_local();
|
||||
if (back[i].timeout > 0)
|
||||
back[i].timeout_refresh = cf->connect_start;
|
||||
back[i].status = STATUS_CONNECTING;
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"connect failed, trying next address (%d/%d) for %s", next + 1,
|
||||
cf->addr_count, back[i].url_adr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
void back_delete_all(httrackp * opt, cache_back * cache, struct_back * sback) {
|
||||
if (sback != NULL) {
|
||||
int i;
|
||||
@@ -1911,8 +1981,11 @@ int back_add(struct_back * sback, httrackp * opt, cache_back * cache, const char
|
||||
// ouvrir liaison, envoyer requète
|
||||
// ne pas traiter ou recevoir l'en tête immédiatement
|
||||
hts_init_htsblk(&back[p].r);
|
||||
//memset(&(back[p].r), 0, sizeof(htsblk));
|
||||
// memset(&(back[p].r), 0, sizeof(htsblk));
|
||||
back[p].r.location = back[p].location_buffer;
|
||||
// fresh connect: address list not yet probed, start at the first
|
||||
sback->connect_fallback[p].addr_index = 0;
|
||||
sback->connect_fallback[p].addr_count = -1;
|
||||
// recopier proxy
|
||||
if ((back[p].r.req.proxy.active = opt->proxy.active)) {
|
||||
if (StringBuff(opt->proxy.bindhost) != NULL)
|
||||
@@ -2369,21 +2442,25 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
// en cas de gestion du connect préemptif
|
||||
#if HTS_XCONN
|
||||
if (back[i].status == STATUS_CONNECTING) { // connexion
|
||||
do_wait = 1;
|
||||
// a connecting slot always carries a live socket; guard anyway so a
|
||||
// stray INVALID_SOCKET can never reach FD_SET (mirrors the recv branch)
|
||||
if (back[i].r.soc != INVALID_SOCKET) {
|
||||
do_wait = 1;
|
||||
|
||||
// noter socket write
|
||||
FD_SET(back[i].r.soc, &fds_c);
|
||||
// noter socket write
|
||||
FD_SET(back[i].r.soc, &fds_c);
|
||||
|
||||
// noter socket erreur
|
||||
FD_SET(back[i].r.soc, &fds_e);
|
||||
// noter socket erreur
|
||||
FD_SET(back[i].r.soc, &fds_e);
|
||||
|
||||
// calculer max
|
||||
if (max_c) {
|
||||
max_c = 0;
|
||||
nfds = back[i].r.soc;
|
||||
} else if (back[i].r.soc > nfds) {
|
||||
// ID socket la plus élevée
|
||||
nfds = back[i].r.soc;
|
||||
// calculer max
|
||||
if (max_c) {
|
||||
max_c = 0;
|
||||
nfds = back[i].r.soc;
|
||||
} else if (back[i].r.soc > nfds) {
|
||||
// ID socket la plus élevée
|
||||
nfds = back[i].r.soc;
|
||||
}
|
||||
}
|
||||
|
||||
} else
|
||||
@@ -2517,8 +2594,20 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
}
|
||||
// ---- FLAG WRITE MIS A UN?: POUR LE CONNECT
|
||||
if (back[i].status == STATUS_CONNECTING) { // attendre connect
|
||||
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||
int dispo = 0;
|
||||
|
||||
// probe the resolved address list once per fresh connect (cache hit:
|
||||
// the host was resolved when this connect was opened)
|
||||
if (cf->addr_count < 0 && back[i].r.soc != INVALID_SOCKET &&
|
||||
!back[i].r.is_file) {
|
||||
SOCaddr scratch[HTS_MAXADDRNUM];
|
||||
|
||||
cf->addr_count = hts_dns_resolve_all(opt, back[i].url_adr, scratch,
|
||||
HTS_MAXADDRNUM, NULL);
|
||||
cf->connect_start = time_local();
|
||||
}
|
||||
|
||||
// vérifier l'existance de timeout-check
|
||||
if (!gestion_timeout)
|
||||
if (back[i].timeout > 0)
|
||||
@@ -2526,7 +2615,20 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
|
||||
// connecté?
|
||||
dispo = FD_ISSET(back[i].r.soc, &fds_c);
|
||||
if (dispo) { // ok connected!!
|
||||
if (dispo) { // socket ready: connect() finished (ok or failed)
|
||||
// a refused/failed connect is reported writable too; probe SO_ERROR
|
||||
// and, on failure, fall back to the next address (or fail the slot)
|
||||
if (connect_socket_error(back[i].r.soc) != 0) {
|
||||
if (!back_connect_next(opt, sback, i)) {
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||
strcpybuff(back[i].r.msg, "Connect Error");
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
}
|
||||
continue; // reconnected (stay connecting) or failed
|
||||
}
|
||||
busy_state = 1;
|
||||
|
||||
#if HTS_USEOPENSSL
|
||||
@@ -3884,6 +3986,29 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
|
||||
if (back[i].status > 0) { // réception/connexion/..
|
||||
if (back[i].timeout > 0) {
|
||||
// a stuck connect with a fallback address: retry the next one well
|
||||
// before the full timeout (dead IPv6 on a dual-stack host, ...)
|
||||
if (back[i].status == STATUS_CONNECTING) {
|
||||
const hts_connect_fallback *const cf =
|
||||
&sback->connect_fallback[i];
|
||||
|
||||
if (back_connect_fallback_due(cf->addr_index, cf->addr_count,
|
||||
(int) (act - cf->connect_start),
|
||||
back[i].timeout)) {
|
||||
if (back_connect_next(opt, sback, i)) {
|
||||
continue; // reconnected to the next candidate
|
||||
}
|
||||
// fallback was due but no socket could be opened
|
||||
// (back_connect_next closed the dead one): stop now rather than
|
||||
// spin on an invalid fd
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||
strcpybuff(back[i].r.msg, "Connect Error");
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout);
|
||||
if (((int) (act - back[i].timeout_refresh)) >= back[i].timeout) {
|
||||
hts_log_print(opt, LOG_DEBUG, "connection timed out for %s%s", back[i].url_adr,
|
||||
|
||||
@@ -220,6 +220,25 @@ struct cache_back_zip_entry {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
|
||||
so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
|
||||
did. */
|
||||
static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
|
||||
const char *what, int zErr) {
|
||||
if (!cache->zipWriteFailed) {
|
||||
cache->zipWriteFailed = HTS_TRUE;
|
||||
if (check_fatal_io_errno()) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: cache write failed (%s): %s", what,
|
||||
hts_get_zerror(zErr));
|
||||
}
|
||||
}
|
||||
opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
|
||||
}
|
||||
|
||||
/* Ajout d'un fichier en cache */
|
||||
void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_adr, const char *url_fil, const char *url_save,
|
||||
@@ -236,6 +255,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
const char *url_save_suffix = url_save;
|
||||
int zErr;
|
||||
|
||||
/* already failed and aborting; don't touch the broken stream again */
|
||||
if (cache->zipWriteFailed)
|
||||
return;
|
||||
|
||||
// robots.txt hack
|
||||
if (url_save == NULL) {
|
||||
dataincache = 0; // testing links
|
||||
@@ -346,9 +369,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
*/
|
||||
headers, (uInt) strlen(headers), NULL, 0, NULL, /* comment */
|
||||
Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
|
||||
int zip_zipOpenNewFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipOpenNewFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Write data in cache */
|
||||
@@ -358,9 +380,8 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
|
||||
(int) r->size)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -381,9 +402,10 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
if ((zErr =
|
||||
zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
|
||||
(int) nl)) != Z_OK) {
|
||||
int zip_zipWriteInFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipWriteInFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "writing to the cache",
|
||||
zErr);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} while(nl > 0);
|
||||
@@ -397,16 +419,14 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
|
||||
|
||||
/* Close */
|
||||
if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
|
||||
int zip_zipCloseFileInZip_failed = 0;
|
||||
|
||||
assertf(zip_zipCloseFileInZip_failed);
|
||||
cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Flush */
|
||||
if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
|
||||
int zip_zipFlush_failed = 0;
|
||||
|
||||
assertf(zip_zipFlush_failed);
|
||||
cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htslib.h"
|
||||
#include "htszlib.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@@ -316,6 +317,136 @@ static int disk_fallback_selftest(httrackp *opt) {
|
||||
return fail;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
size_t budget; /**< bytes allowed through before writes start failing */
|
||||
int fail_errno; /**< errno set on the failing write (ENOSPC, EIO, ...) */
|
||||
int writes; /**< zwrite call count, to detect re-entry into the stream */
|
||||
} writefail_inject;
|
||||
|
||||
/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
|
||||
(the #174/#219 condition). Counts calls so the test can prove a flagged cache
|
||||
never re-enters the stream. */
|
||||
static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
|
||||
const void *buf, uLong size) {
|
||||
writefail_inject *inj = (writefail_inject *) opaque;
|
||||
|
||||
inj->writes++;
|
||||
if (inj->budget >= (size_t) size) {
|
||||
inj->budget -= (size_t) size;
|
||||
return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
|
||||
}
|
||||
errno = inj->fail_errno;
|
||||
return 0; /* short write -> the minizip op returns an error */
|
||||
}
|
||||
|
||||
/* Open a ZIP whose writes fail past inj->budget, so cache_add() hits an error.
|
||||
*/
|
||||
static zipFile selftest_open_failing_zip(const char *path,
|
||||
writefail_inject *inj) {
|
||||
zlib_filefunc_def ff;
|
||||
|
||||
fill_fopen_filefunc(&ff); /* real fopen/read/seek/close; ignores opaque */
|
||||
ff.zwrite_file = selftest_failing_zwrite;
|
||||
ff.opaque = inj;
|
||||
return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &ff);
|
||||
}
|
||||
|
||||
/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP). */
|
||||
static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
|
||||
const char *body, size_t body_len) {
|
||||
htsblk r;
|
||||
char locbuf[4];
|
||||
char *bodycopy = malloct(body_len);
|
||||
|
||||
hts_init_htsblk(&r);
|
||||
r.statuscode = 200;
|
||||
r.size = (LLint) body_len;
|
||||
strcpybuff(r.msg, "OK");
|
||||
strcpybuff(r.contenttype, "application/octet-stream");
|
||||
locbuf[0] = '\0';
|
||||
r.location = locbuf;
|
||||
r.is_write = 0;
|
||||
memcpy(bodycopy, body, body_len);
|
||||
r.adr = bodycopy;
|
||||
cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
|
||||
NULL);
|
||||
freet(bodycopy);
|
||||
}
|
||||
|
||||
/* #174/#219: a failing cache write used to crash via assertf(); it must instead
|
||||
stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
|
||||
is flagged and a sibling write doesn't re-enter the broken stream. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir) {
|
||||
int fail = 0;
|
||||
char path[HTS_URLMAXSIZE];
|
||||
/* incompressible + big, so deflate flushes (and fails) mid-write, before
|
||||
* close */
|
||||
static const size_t body_len = 256 * 1024;
|
||||
char *body = malloct(body_len);
|
||||
int phase;
|
||||
|
||||
gen_body(body, body_len, 1 /* incompressible */);
|
||||
fconcat(path, sizeof(path), dir, "/wfail.zip");
|
||||
|
||||
/* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
|
||||
branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
|
||||
branch). Both must abort the mirror. */
|
||||
for (phase = 0; phase < 2; phase++) {
|
||||
cache_back cache;
|
||||
writefail_inject inj;
|
||||
int writes_after_fail;
|
||||
|
||||
inj.budget = (phase == 0) ? 4096 : 0;
|
||||
inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
|
||||
inj.writes = 0;
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = stderr;
|
||||
cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.zipOutput = selftest_open_failing_zip(path, &inj);
|
||||
if (cache.zipOutput == NULL) {
|
||||
fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
|
||||
fail++;
|
||||
continue;
|
||||
}
|
||||
|
||||
opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
|
||||
writefail_store(opt, &cache, "/blob.bin", body, body_len);
|
||||
if (!cache.zipWriteFailed) {
|
||||
fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
|
||||
phase);
|
||||
fail++;
|
||||
}
|
||||
if (opt->state.exit_xh != -1) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
|
||||
phase, opt->state.exit_xh);
|
||||
fail++;
|
||||
}
|
||||
|
||||
/* a flagged cache must no-op a sibling write: no further backend write */
|
||||
writes_after_fail = inj.writes;
|
||||
writefail_store(opt, &cache, "/blob2.bin", body, 16);
|
||||
if (inj.writes != writes_after_fail) {
|
||||
fprintf(stderr,
|
||||
"cache-writefail: phase %d: sibling write re-entered the broken "
|
||||
"stream (%d extra backend writes)\n",
|
||||
phase, inj.writes - writes_after_fail);
|
||||
fail++;
|
||||
}
|
||||
|
||||
if (cache.zipOutput != NULL) {
|
||||
zipClose(cache.zipOutput,
|
||||
NULL); /* best-effort; may fail on the backend */
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
freet(body);
|
||||
return fail;
|
||||
}
|
||||
|
||||
int cache_selftests(httrackp *opt, const char *dir) {
|
||||
int failures = 0;
|
||||
cache_back cache;
|
||||
|
||||
@@ -52,6 +52,10 @@ int cache_selftests(httrackp *opt, const char *dir);
|
||||
committed file, never by the test). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
|
||||
crashing. Returns the failed-check count. */
|
||||
int cache_write_failure_selftest(httrackp *opt, const char *dir);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3703,9 +3703,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->maxsoc > 0)
|
||||
to->maxsoc = from->maxsoc;
|
||||
|
||||
/* hts_boolean/enum fields are unsigned (GCC), so a bare `> -1` unset-guard
|
||||
is always false; cast to int to keep the -1 "unset" sentinel test. */
|
||||
if ((int) from->nearlink > -1)
|
||||
/* hts_tristate fields use HTS_DEFAULT (-1) for "unspecified": copy_htsopt
|
||||
skips them so the target keeps its value. */
|
||||
if (from->nearlink > -1)
|
||||
to->nearlink = from->nearlink;
|
||||
|
||||
if (from->timeout > -1)
|
||||
@@ -3732,10 +3732,10 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->hostcontrol > -1)
|
||||
to->hostcontrol = from->hostcontrol;
|
||||
|
||||
if ((int) from->errpage > -1)
|
||||
if (from->errpage > -1)
|
||||
to->errpage = from->errpage;
|
||||
|
||||
if ((int) from->parseall > -1)
|
||||
if (from->parseall > -1)
|
||||
to->parseall = from->parseall;
|
||||
|
||||
// test all: bit 8 de travel
|
||||
|
||||
@@ -152,6 +152,15 @@ struct lien_adrfilsave {
|
||||
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
||||
};
|
||||
|
||||
/** Per-slot connect-fallback bookkeeping (parallel to struct_back.lnk).
|
||||
Tracks which resolved address the slot is currently connecting to so a
|
||||
stuck connect can be retried against the next one. */
|
||||
typedef struct hts_connect_fallback {
|
||||
int addr_index; /**< candidate being connected (0-based) */
|
||||
int addr_count; /**< resolved addresses; -1 = not yet probed */
|
||||
TStamp connect_start; /**< when the current candidate's connect began */
|
||||
} hts_connect_fallback;
|
||||
|
||||
/** The download-slot ring: the set of concurrent transfers in flight.
|
||||
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
||||
read it but do not resize or free it. */
|
||||
@@ -168,6 +177,7 @@ struct struct_back {
|
||||
int count; /**< number of usable slots (back_max) */
|
||||
coucal ready; /**< index of slots whose transfer completed */
|
||||
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
||||
hts_connect_fallback *connect_fallback; /**< per-slot, count+1 entries */
|
||||
};
|
||||
|
||||
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
||||
@@ -204,6 +214,8 @@ struct cache_back {
|
||||
cache_back_zip_entry *zipEntries;
|
||||
int zipEntriesOffs;
|
||||
int zipEntriesCapa;
|
||||
hts_boolean
|
||||
zipWriteFailed; /**< a cache write failed; stop touching the stream */
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_hash_struct
|
||||
@@ -372,6 +384,13 @@ void check_rate(TStamp stat_timestart, int maxrate);
|
||||
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
||||
Not thread-safe; call from the single crawl loop. */
|
||||
|
||||
/* True if a connecting slot should give up on the current address and try the
|
||||
next one: a fallback address remains (addr_index+1 < addr_count) and the
|
||||
candidate has been connecting for at least its deadline, min(timeout, an
|
||||
internal cap). elapsed/timeout in seconds. Exposed for the -#D self-test. */
|
||||
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
|
||||
int timeout);
|
||||
|
||||
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
||||
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
||||
room for naming tests and stops at 0 when the stack is nearly full. */
|
||||
|
||||
@@ -45,8 +45,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsmodules.h"
|
||||
#include "htszlib.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsselftest.h"
|
||||
#include "htsmd5.h"
|
||||
|
||||
#include <ctype.h>
|
||||
@@ -113,442 +112,6 @@ HTSEXT_API int hts_main(int argc, char **argv) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// very minimalistic internal tests
|
||||
static void basic_selftests(void) {
|
||||
// BUG 756328
|
||||
const char *const source = "/intent/tweet?url=https%3A%2F%2Fwww.httrack.com%2Fvacatures%2F1562519%2Fmedewerker-data-services&text=Medewerker+Data+Services&via=httrackcom";
|
||||
char buffer[1024];
|
||||
fil_normalized(source, buffer);
|
||||
// MD5 selftests
|
||||
md5selftest();
|
||||
// cookie_get field extraction (tab-separated, 0-based)
|
||||
{
|
||||
char cbuf[8192];
|
||||
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 0), "a") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 1), "b") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 2), "c") == 0);
|
||||
// multi-char fields catch length/boundary bugs that 1-char fields hide
|
||||
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 0), "host") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "host\tx\t/path/to", 2), "/path/to") == 0);
|
||||
assertf(strcmp(cookie_get(cbuf, "a\t\tc", 1), "") == 0); // empty field
|
||||
assertf(strcmp(cookie_get(cbuf, "a\tb\tc", 9), "") == 0); // beyond last
|
||||
}
|
||||
// back_infostr() status-line formatting (no sockets: pure formatting over
|
||||
// in-memory slots). Stresses a few thousand entries across every status-code
|
||||
// arm. Regression for a clobber bug where the size/totalsize trailer was
|
||||
// written straight into the destination, wiping the URL it had just built.
|
||||
{
|
||||
static const struct {
|
||||
int code;
|
||||
const char *tag;
|
||||
} cases[] = {
|
||||
{200, "READY "}, {-1, "ERROR "}, {-2, "TIMEOUT "},
|
||||
{-3, "TOOSLOW "}, {400, "BADREQUEST "}, {403, "FORBIDDEN "},
|
||||
{404, "NOT FOUND "}, {500, "SERVERROR "}, {999, "ERROR(999)"},
|
||||
};
|
||||
const int ncases = (int) (sizeof(cases) / sizeof(cases[0]));
|
||||
const int n = 2000;
|
||||
lien_back *slots = calloct(n, sizeof(lien_back));
|
||||
char line[HTS_URLMAXSIZE * 4 + 1024];
|
||||
char expect[HTS_URLMAXSIZE * 4 + 1024];
|
||||
struct_back sb;
|
||||
int idx;
|
||||
|
||||
sb.lnk = slots;
|
||||
sb.count = n;
|
||||
sb.ready = NULL;
|
||||
sb.ready_size_bytes = 0;
|
||||
for (idx = 0; idx < n; idx++) {
|
||||
lien_back *const slot = &slots[idx];
|
||||
|
||||
slot->r.location = slot->location_buffer;
|
||||
slot->status = STATUS_READY;
|
||||
slot->r.statuscode = cases[idx % ncases].code;
|
||||
slot->r.size = idx;
|
||||
slot->r.totalsize = idx + 1;
|
||||
snprintf(slot->url_adr, sizeof(slot->url_adr), "http://h%d.example", idx);
|
||||
snprintf(slot->url_fil, sizeof(slot->url_fil), "/p/%d.html", idx);
|
||||
}
|
||||
for (idx = 0; idx < n; idx++) {
|
||||
line[0] = '\0';
|
||||
back_infostr(&sb, idx, 3, line, sizeof(line));
|
||||
// Exact match (not substring): pins tag/URL/trailer order and rejects a
|
||||
// partial clobber, duplication, or truncation that a presence check would
|
||||
// let through. The expected format is stated here independently.
|
||||
snprintf(expect, sizeof(expect),
|
||||
"%s\"http://h%d.example/p/%d.html\" " LLintP " " LLintP " ",
|
||||
cases[idx % ncases].tag, idx, idx, (LLint) idx,
|
||||
(LLint) (idx + 1));
|
||||
assertf(strcmp(line, expect) == 0);
|
||||
}
|
||||
// Near-maximal URL, driven through back_info() (which owns the status
|
||||
// buffer internally and prints to a FILE*). url_adr + url_fil together
|
||||
// overrun the old HTS_URLMAXSIZE*2+1024 buffer, so the bounded appends
|
||||
// would abort unless that buffer is sized to hold both fields. Regression
|
||||
// for that sizing -- exercising back_infostr() directly would miss it,
|
||||
// since the caller's buffer is what matters.
|
||||
{
|
||||
lien_back *const slot = &slots[0];
|
||||
const size_t adrlen = sizeof(slot->url_adr) - 8;
|
||||
const size_t fillen = sizeof(slot->url_fil) - 8;
|
||||
FILE *const fp = tmpfile();
|
||||
size_t got;
|
||||
|
||||
assertf(fp != NULL);
|
||||
slot->status = STATUS_READY;
|
||||
slot->r.statuscode = 200;
|
||||
slot->r.size = 1;
|
||||
slot->r.totalsize = 2;
|
||||
memset(slot->url_adr, 'a', adrlen);
|
||||
slot->url_adr[adrlen] = '\0';
|
||||
slot->url_fil[0] = '/';
|
||||
memset(slot->url_fil + 1, 'b', fillen - 1);
|
||||
slot->url_fil[fillen] = '\0';
|
||||
back_info(&sb, 0, 3, fp);
|
||||
rewind(fp);
|
||||
got = fread(line, 1, sizeof(line) - 1, fp);
|
||||
line[got] = '\0';
|
||||
fclose(fp);
|
||||
snprintf(expect, sizeof(expect),
|
||||
"READY \"%s%s\" " LLintP " " LLintP " " LF, slot->url_adr,
|
||||
slot->url_fil, (LLint) 1, (LLint) 2);
|
||||
assertf(strcmp(line, expect) == 0);
|
||||
}
|
||||
freet(slots);
|
||||
}
|
||||
// next_token(): in-place token scanner. Strips surrounding quotes, unescapes
|
||||
// \" and \\ when flag is set, and returns the token terminator (the space, or
|
||||
// NULL at end of string). The unquote/unescape rewrites the string in place
|
||||
// by shifting left, so the result is always shorter -- regression for that
|
||||
// compaction.
|
||||
{
|
||||
char tok[64];
|
||||
|
||||
// plain token: unchanged, returns a pointer AT the separating space (exact
|
||||
// position, not just any space -- a strchr-style impl would land elsewhere
|
||||
// once quotes shift the content)
|
||||
strcpybuff(tok, "abc def");
|
||||
{
|
||||
char *const end = next_token(tok, 0);
|
||||
assertf(end == tok + 3 && *end == ' ' && strcmp(tok, "abc def") == 0);
|
||||
}
|
||||
// surrounding quotes stripped, returns the (post-shift) trailing space
|
||||
strcpybuff(tok, "\"ab\" cd");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == tok + 2 && *end == ' ' && strcmp(tok, "ab cd") == 0);
|
||||
}
|
||||
// a space inside quotes does not end the token; end of string returns NULL
|
||||
strcpybuff(tok, "\"a b\"c");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a bc") == 0);
|
||||
}
|
||||
// \" and \\ are unescaped to literal " and \ in place
|
||||
strcpybuff(tok, "\"a\\\"b\\\\c\"");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a\"b\\c") == 0);
|
||||
}
|
||||
// unterminated quote: the opening quote is dropped, the rest survives, and
|
||||
// the scan runs to the NUL (returns NULL)
|
||||
strcpybuff(tok, "\"ab");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "ab") == 0);
|
||||
}
|
||||
// trailing lone backslash in a quote: *(p+1) is the NUL, not an escape, so
|
||||
// the backslash is kept intact (and there is no over-read past the NUL)
|
||||
strcpybuff(tok, "\"a\\");
|
||||
{
|
||||
char *const end = next_token(tok, 1);
|
||||
assertf(end == NULL && strcmp(tok, "a\\") == 0);
|
||||
}
|
||||
}
|
||||
// fil_normalized(): canonicalizes a URL path. Query arguments are sorted
|
||||
// alphabetically (by the text after each '?'/'&') and the query is rebuilt
|
||||
// through a bounded builder; outside the query, "//" collapses to "/".
|
||||
// Regression for that builder.
|
||||
{
|
||||
char norm[256];
|
||||
|
||||
assertf(strcmp(fil_normalized("/p?b=2&a=1&c=3", norm), "/p?a=1&b=2&c=3") ==
|
||||
0);
|
||||
assertf(strcmp(fil_normalized("/a//b", norm), "/a/b") == 0);
|
||||
// "//" is collapsed only before the query; inside the query it is kept
|
||||
assertf(strcmp(fil_normalized("/a//b?x=c//d", norm), "/a/b?x=c//d") == 0);
|
||||
}
|
||||
// give_mimext(): mime type -> file extension, bounded into the caller buffer.
|
||||
// Returns 1 when an extension was written, 0 otherwise.
|
||||
{
|
||||
char ext[16];
|
||||
|
||||
assertf(give_mimext(ext, sizeof(ext), "image/gif") == 1);
|
||||
assertf(strcmp(ext, "gif") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "text/html") == 1);
|
||||
assertf(strcmp(ext, "html") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "no/such-mime-type") == 0);
|
||||
assertf(ext[0] == '\0');
|
||||
}
|
||||
// convtolower(): lower-cases into the caller buffer (bounded by its size).
|
||||
{
|
||||
char low[64];
|
||||
|
||||
assertf(strcmp(convtolower(low, sizeof(low), "ABC/Def.HTML"),
|
||||
"abc/def.html") == 0);
|
||||
}
|
||||
// cut_path(): splits a path into directory (with trailing '/') and basename,
|
||||
// each bounded by its buffer size.
|
||||
{
|
||||
char path[256];
|
||||
char pname[256];
|
||||
|
||||
{
|
||||
char full[] = "/dir/sub/file.html";
|
||||
|
||||
cut_path(full, path, sizeof(path), pname, sizeof(pname));
|
||||
assertf(strcmp(path, "/dir/sub/") == 0);
|
||||
assertf(strcmp(pname, "file.html") == 0);
|
||||
}
|
||||
{ // a trailing slash is trimmed before the split
|
||||
char full[] = "/dir/sub/";
|
||||
|
||||
cut_path(full, path, sizeof(path), pname, sizeof(pname));
|
||||
assertf(strcmp(path, "/dir/") == 0);
|
||||
assertf(strcmp(pname, "sub") == 0);
|
||||
}
|
||||
{ // a path of length <= 1 yields empty results
|
||||
char full[] = "/";
|
||||
|
||||
cut_path(full, path, sizeof(path), pname, sizeof(pname));
|
||||
assertf(path[0] == '\0' && pname[0] == '\0');
|
||||
}
|
||||
}
|
||||
// get_httptype_sized(): a long MIME type (Office OOXML reaches 73 chars) is
|
||||
// written whole into a contenttype-sized buffer; returns 1 on a match, 0 when
|
||||
// flag==0 and nothing matched. Regression for the old contenttype[64]
|
||||
// overflow.
|
||||
{
|
||||
httrackp *opt = hts_create_opt();
|
||||
htsblk r; // write into the real struct field, not a stand-in
|
||||
|
||||
assertf(opt != NULL);
|
||||
// a long MIME (Office OOXML reaches 73 chars) must fit htsblk.contenttype
|
||||
// whole: a [64] field would make this bounded copy abort.
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"deck.pptx", 0) == 1);
|
||||
assertf(strcmp(r.contenttype,
|
||||
"application/vnd.openxmlformats-officedocument."
|
||||
"presentationml.presentation") == 0);
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"x.gif", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "image/gif") == 0);
|
||||
// no extension and flag==0: nothing written, returns 0
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 0) == 0);
|
||||
assertf(r.contenttype[0] == '\0');
|
||||
// no extension and flag==1: octet-stream fallback, returns 1
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 1) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
|
||||
// a user --assume rule with an empty value matches but writes nothing:
|
||||
// get_userhttptype returns 1 with the buffer empty, so get_httptype_sized
|
||||
// must still report 0 (callers test the return like the old
|
||||
// strnotempty(s)).
|
||||
StringCopy(opt->mimedefs, "\ncgi=\n");
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"/x.cgi", 0) == 0);
|
||||
assertf(r.contenttype[0] == '\0');
|
||||
StringCopy(opt->mimedefs, "\ncgi=text/html\n");
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"/x.cgi", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "text/html") == 0);
|
||||
hts_free_opt(opt);
|
||||
}
|
||||
// adr_normalized_sized(): bounded host normalization (passthrough when
|
||||
// already normal).
|
||||
{
|
||||
char n[HTS_URLMAXSIZE];
|
||||
|
||||
assertf(strcmp(adr_normalized_sized("example.com", n, sizeof(n)),
|
||||
"example.com") == 0);
|
||||
}
|
||||
// standard_name(): builds "<name><md5?>.<ext>" into a bounded buffer. The md5
|
||||
// is appended (4 chars) only when the URL has a query string (see url_md5),
|
||||
// so test both; pin the structure (name + ext, lengths), not the md5 chars.
|
||||
{
|
||||
char b[HTS_URLMAXSIZE * 2];
|
||||
const char *nom = "index.html"; // name part
|
||||
const char *dot = nom + 5; // points at ".html"
|
||||
size_t len;
|
||||
|
||||
// no query -> no md5: "index" + ".html"
|
||||
standard_name(b, sizeof(b), dot, nom, "http://example.com/index.html", 0);
|
||||
assertf(strcmp(b, "index.html") == 0);
|
||||
// query -> 4 md5 chars between name and ext: "index" + md5(4) + ".html"
|
||||
standard_name(b, sizeof(b), dot, nom, "http://example.com/index.html?v=1",
|
||||
0);
|
||||
len = strlen(b);
|
||||
assertf(len == 5 + 4 + 5);
|
||||
assertf(strncmp(b, "index", 5) == 0);
|
||||
assertf(strcmp(b + len - 5, ".html") == 0);
|
||||
// short names: name kept (<=8), the extension is clamped to 3 -> ".htm"
|
||||
standard_name(b, sizeof(b), dot, nom, "http://example.com/index.html?v=1",
|
||||
1);
|
||||
len = strlen(b);
|
||||
assertf(len == 5 + 4 + 4);
|
||||
assertf(strcmp(b + len - 4, ".htm") == 0);
|
||||
// short names with a >8-char name: the name is clamped to 8 ("indexpag")
|
||||
{
|
||||
const char *lnom = "indexpage.html";
|
||||
const char *ldot = lnom + 9; // points at ".html"
|
||||
|
||||
standard_name(b, sizeof(b), ldot, lnom,
|
||||
"http://example.com/indexpage.html?v=1", 1);
|
||||
len = strlen(b);
|
||||
assertf(len == 8 + 4 + 4);
|
||||
assertf(strncmp(b, "indexpag", 8) == 0);
|
||||
assertf(strcmp(b + len - 4, ".htm") == 0);
|
||||
}
|
||||
}
|
||||
// longfile_to_83(): single-name 8-3 (mode 1) / ISO9660 (mode 2) conversion;
|
||||
// uppercases, clamps the name (8 / 31) and the extension (3). It rewrites
|
||||
// 'save' in place, so pass a mutable array.
|
||||
{
|
||||
char n83[256];
|
||||
|
||||
{
|
||||
char save[] = "longfilename.html";
|
||||
|
||||
longfile_to_83(1, n83, sizeof(n83), save); // 8-3: name->8, ext->3
|
||||
assertf(strcmp(n83, "LONGFILE.HTM") == 0);
|
||||
}
|
||||
{
|
||||
char save[] = "longfilename.html";
|
||||
|
||||
longfile_to_83(2, n83, sizeof(n83), save); // ISO9660: name->31, ext->3
|
||||
assertf(strcmp(n83, "LONGFILENAME.HTM") == 0);
|
||||
}
|
||||
{ // sanitization: leading '.'->'_', interior dots
|
||||
char save[] = ".a b.c.d e"; // collapse to '_', spaces/specials -> '_'
|
||||
// (only the last dot stays as the separator)
|
||||
longfile_to_83(1, n83, sizeof(n83), save);
|
||||
assertf(strcmp(n83, "_A_B_C.D_E") == 0);
|
||||
}
|
||||
}
|
||||
// long_to_83(): per-segment 8-3 conversion of a whole path.
|
||||
{
|
||||
char n83[HTS_URLMAXSIZE * 2];
|
||||
char save[] = "dir/longfilename.html";
|
||||
|
||||
long_to_83(1, n83, sizeof(n83), save);
|
||||
assertf(strcmp(n83, "DIR/LONGFILE.HTM") == 0);
|
||||
}
|
||||
// lienrelatif(): relative path from the directory of curr_fil to link.
|
||||
{
|
||||
char s[HTS_URLMAXSIZE * 2];
|
||||
|
||||
// same directory -> just the basename
|
||||
assertf(lienrelatif(s, sizeof(s), "dir/page.html", "dir/index.html") == 0);
|
||||
assertf(strcmp(s, "page.html") == 0);
|
||||
// link one level up -> a "../" prefix
|
||||
assertf(lienrelatif(s, sizeof(s), "a.html", "dir/index.html") == 0);
|
||||
assertf(strcmp(s, "../a.html") == 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Self-tests for the htssafe.h bounded string ops (driven by httrack -#8).
|
||||
Returns 0 if every bounded operation behaved correctly, 1 otherwise.
|
||||
The abort-on-overflow guarantee is checked separately by the -#8 "overflow"
|
||||
sub-mode (it aborts the process by design). */
|
||||
static int string_safety_selftests(void) {
|
||||
char buf[8];
|
||||
|
||||
/* strcpybuff into a sized array: exact copy */
|
||||
strcpybuff(buf, "abc");
|
||||
if (strcmp(buf, "abc") != 0)
|
||||
return 1;
|
||||
|
||||
/* strcatbuff append within capacity */
|
||||
strcatbuff(buf, "de");
|
||||
if (strcmp(buf, "abcde") != 0)
|
||||
return 1;
|
||||
|
||||
/* strncatbuff appends at most N source chars */
|
||||
strcpybuff(buf, "ab");
|
||||
strncatbuff(buf, "cdef", 2);
|
||||
if (strcmp(buf, "abcd") != 0)
|
||||
return 1;
|
||||
|
||||
/* strlcpybuff: explicit-capacity copy into a pointer destination, the form
|
||||
the migration moves toward */
|
||||
{
|
||||
char storage[8];
|
||||
char *const p = storage;
|
||||
|
||||
strlcpybuff(p, "hello", sizeof(storage));
|
||||
if (strcmp(p, "hello") != 0)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* strcpybuff into a pointer destination: routes through the unchecked
|
||||
strcpybuff_ptr_ fallback (the path the -#8 warning flags). The warning is
|
||||
intentional here; we only verify the fallback still copies correctly. */
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wattribute-warning"
|
||||
#endif
|
||||
{
|
||||
char storage[8];
|
||||
char *const p = storage;
|
||||
|
||||
strcpybuff(p, "ptr");
|
||||
if (strcmp(p, "ptr") != 0)
|
||||
return 1;
|
||||
}
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
/* htsbuff: bounded builder over a fixed array (append, truncating append,
|
||||
reset, and length tracking) */
|
||||
{
|
||||
char dst[8];
|
||||
htsbuff b = htsbuff_array(dst);
|
||||
|
||||
htsbuff_cat(&b, "ab");
|
||||
htsbuff_cat(&b, "cd");
|
||||
if (strcmp(htsbuff_str(&b), "abcd") != 0 || b.len != 4)
|
||||
return 1;
|
||||
|
||||
htsbuff_catn(&b, "efghij", 2); /* append at most 2 */
|
||||
if (strcmp(htsbuff_str(&b), "abcdef") != 0)
|
||||
return 1;
|
||||
|
||||
htsbuff_cpy(&b, "xyz"); /* reset */
|
||||
if (strcmp(htsbuff_str(&b), "xyz") != 0 || b.len != 3)
|
||||
return 1;
|
||||
|
||||
htsbuff_catc(&b, '!'); /* single character */
|
||||
if (strcmp(htsbuff_str(&b), "xyz!") != 0 || b.len != 4)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* boundary: filling to exactly cap-1 must succeed (one more aborts, which the
|
||||
-#8 overflow-buff mode checks) */
|
||||
{
|
||||
char d2[4];
|
||||
htsbuff c = htsbuff_array(d2);
|
||||
|
||||
htsbuff_cat(&c, "abc");
|
||||
if (strcmp(htsbuff_str(&c), "abc") != 0 || c.len != 3)
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hts_main_internal(int argc, char **argv, httrackp * opt);
|
||||
|
||||
// Main, récupère les paramètres et appelle le robot
|
||||
@@ -1343,6 +906,25 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
*/
|
||||
|
||||
/* Engine self-tests: -#test lists them, -#test=NAME [args] runs one. Handled
|
||||
here, ahead of the no-URL usage gate below, so they need no dummy URL. */
|
||||
{
|
||||
int k;
|
||||
|
||||
for (k = 1; k < argc; k++) {
|
||||
const char *const a = argv[k];
|
||||
|
||||
if (a[0] == '-' && a[1] == '#' && strncmp(a + 2, "test", 4) == 0 &&
|
||||
(a[6] == '\0' || a[6] == '=')) {
|
||||
const char *const name = a[6] == '=' ? a + 7 : NULL;
|
||||
const int code = hts_selftest(opt, name, argc - (k + 1), &argv[k + 1]);
|
||||
|
||||
htsmain_free();
|
||||
return code;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pas d'URL
|
||||
#if DEBUG_STEPS
|
||||
printf("Checking URLs\n");
|
||||
@@ -2431,35 +2013,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
case '#':{ // non documenté
|
||||
com++;
|
||||
switch (*com) {
|
||||
case 'A': // cache self-test: httrack -#A <dir>
|
||||
if (na + 1 < argc) {
|
||||
const int err = cache_selftests(opt, argv[na + 1]);
|
||||
|
||||
printf("cache-selftest: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} else {
|
||||
fprintf(stderr, "Option #A requires a directory argument\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'B': // golden cache fixture read: httrack -#B <dir> [regen]
|
||||
if (na + 1 < argc) {
|
||||
const int regen =
|
||||
(na + 2 < argc && strcmp(argv[na + 2], "regen") == 0);
|
||||
const int err =
|
||||
cache_golden_selftest(opt, argv[na + 1], regen);
|
||||
|
||||
printf("cache-golden: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} else {
|
||||
fprintf(stderr, "Option #B requires a directory argument\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
|
||||
{
|
||||
int hasFilter = 0;
|
||||
@@ -2579,7 +2132,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
(r.size >= 0) ? r.size : (-r.size));
|
||||
if (r.contenttype >= 0) {
|
||||
fprintf(stdout, "Content-Type: %s\r\n",
|
||||
r.contenttype);
|
||||
hts_effective_mime(r.contenttype));
|
||||
}
|
||||
if (r.cdispo[0]) {
|
||||
fprintf(stdout, "Content-Disposition: %s\r\n",
|
||||
@@ -2757,458 +2310,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
break;
|
||||
|
||||
case '0': /* test #0 : filters */
|
||||
if (na + 2 >= argc) {
|
||||
HTS_PANIC_PRINTF
|
||||
("Option #0 needs to be followed by a filter string and a string");
|
||||
printf("Example: '-#0' '*.gif' 'foo.gif'\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
if (strjoker(argv[na + 2], argv[na + 1], NULL, NULL))
|
||||
printf("%s does match %s\n", argv[na + 2], argv[na + 1]);
|
||||
else
|
||||
printf("%s does NOT match %s\n", argv[na + 2],
|
||||
argv[na + 1]);
|
||||
htsmain_free();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
case '1': /* test #1 : fil_simplifie */
|
||||
if (na + 1 >= argc) {
|
||||
HTS_PANIC_PRINTF("Option #1 needs to be followed by an URL");
|
||||
printf("Example: '-#1' ./foo/bar/../foobar\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
fil_simplifie(argv[na + 1]);
|
||||
printf("simplified=%s\n", argv[na + 1]);
|
||||
htsmain_free();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
case 'l': /* lienrelatif: relative link from curr_fil to link */
|
||||
if (na + 2 >= argc) {
|
||||
HTS_PANIC_PRINTF(
|
||||
"Option #l needs a link and a current-file path");
|
||||
printf(
|
||||
"Example: '-#l' 'host/dir/img.gif' 'host/dir/p.html'\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
char s[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (lienrelatif(s, sizeof(s), argv[na + 1], argv[na + 2]) ==
|
||||
0)
|
||||
printf("relative=%s\n", s);
|
||||
else
|
||||
printf("relative=<ERROR>\n");
|
||||
htsmain_free();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
case 'i': /* ident_url_relatif: resolve a link -> adr/fil */
|
||||
if (na + 3 >= argc) {
|
||||
HTS_PANIC_PRINTF(
|
||||
"Option #i needs a link, an origin address and file");
|
||||
printf("Example: '-#i' '../img.gif' 'www.foo.com' "
|
||||
"'/d/p.html'\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
lien_adrfil af;
|
||||
const int r = ident_url_relatif(argv[na + 1], argv[na + 2],
|
||||
argv[na + 3], &af);
|
||||
|
||||
if (r == 0)
|
||||
printf("adr=%s fil=%s\n", af.adr, af.fil);
|
||||
else
|
||||
printf("error=%d\n", r);
|
||||
htsmain_free();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
case '2': // mimedefs
|
||||
if (na + 1 >= argc) {
|
||||
HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
|
||||
printf("Example: '-#2' /foo/bar.php\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
char mime[256];
|
||||
|
||||
// initialiser mimedefs
|
||||
//get_userhttptype(opt,1,opt->mimedefs,NULL);
|
||||
// check
|
||||
if (get_httptype_sized(opt, mime, sizeof(mime), argv[na + 1],
|
||||
0)) {
|
||||
char ext[256];
|
||||
|
||||
printf("%s is '%s'\n", argv[na + 1], mime);
|
||||
if (give_mimext(ext, sizeof(ext), mime)) {
|
||||
printf("and its local type is '.%s'\n", ext);
|
||||
}
|
||||
} else {
|
||||
printf("%s is of an unknown MIME type\n", argv[na + 1]);
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
case '3': // charset tests: httrack -#3 "iso-8859-1" "café"
|
||||
if (++na + 1 < argc) {
|
||||
char *s =
|
||||
hts_convertStringToUTF8(argv[na+1], strlen(argv[na+1]), argv[na]);
|
||||
if (s != NULL) {
|
||||
printf("%s\n", s);
|
||||
free(s);
|
||||
} else {
|
||||
fprintf(stderr, "invalid string for charset %s\n", argv[na]);
|
||||
}
|
||||
na += 2;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Option #3 needs to be followed by a charset and a string");
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '4': // IDNA encoder: httrack -#4 "www.café.com"
|
||||
if (++na < argc) {
|
||||
char *s = hts_convertStringUTF8ToIDNA(argv[na], strlen(argv[na]));
|
||||
if (s != NULL) {
|
||||
printf("%s\n", s);
|
||||
free(s);
|
||||
} else {
|
||||
fprintf(stderr, "invalid string '%s'\n", argv[na]);
|
||||
}
|
||||
na += 1;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Option #4 needs to be followed by an IDNA string");
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '5': // IDNA encoder: httrack -#5
|
||||
if (++na < argc) {
|
||||
char *s = hts_convertStringIDNAToUTF8(argv[na], strlen(argv[na]));
|
||||
if (s != NULL) {
|
||||
printf("%s\n", s);
|
||||
free(s);
|
||||
} else {
|
||||
fprintf(stderr, "invalid string '%s'\n", argv[na]);
|
||||
}
|
||||
na += 1;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Option #5 needs to be followed by an IDNA string");
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '6': // entities: httrack -#6 "&foo;" ["encoding"]
|
||||
if (++na < argc) {
|
||||
char *const s = strdup(argv[na]);
|
||||
const char *const enc = na + 1 < argc ? argv[na + 1] : "UTF-8";
|
||||
if (s != NULL
|
||||
&& hts_unescapeEntitiesWithCharset(s, s, strlen(s),
|
||||
enc) == 0) {
|
||||
printf("%s\n", s);
|
||||
free(s);
|
||||
} else {
|
||||
fprintf(stderr, "invalid string '%s'\n", argv[na]);
|
||||
}
|
||||
na += 1;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Option #6 needs to be followed by a string");
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '8': /* string-safety selftest: httrack -#8 [overflow <bigstr>] */
|
||||
if (na + 1 < argc
|
||||
&& strncmp(argv[na + 1], "overflow", 8) == 0) {
|
||||
/* Deliberately exceed a sized buffer: the bounded op must
|
||||
abort. The source comes from argv so its length is opaque
|
||||
to the compiler (no static -Wstringop-overflow, genuine
|
||||
runtime check). "overflow-buff" exercises htsbuff. */
|
||||
char small[4];
|
||||
const char *const src =
|
||||
(na + 2 < argc) ? argv[na + 2] : "overflowing";
|
||||
|
||||
if (strcmp(argv[na + 1], "overflow-buff") == 0) {
|
||||
htsbuff b = htsbuff_array(small);
|
||||
|
||||
htsbuff_cat(&b, src);
|
||||
} else {
|
||||
strcpybuff(small, src);
|
||||
}
|
||||
printf("strsafe: NOT aborted\n"); /* must be unreachable */
|
||||
htsmain_free();
|
||||
return 1;
|
||||
} else {
|
||||
const int err = string_safety_selftests();
|
||||
|
||||
printf("strsafe: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
}
|
||||
break;
|
||||
case '7': // hashtable selftest: httrack -#7 nb_entries
|
||||
basic_selftests();
|
||||
if (++na < argc) {
|
||||
char *const snum = strdup(argv[na]);
|
||||
unsigned long count = 0;
|
||||
const char *const names[] = {
|
||||
"", "add", "delete", "dry-add", "dry-del",
|
||||
"test-exists", "test-not-exist"
|
||||
};
|
||||
const struct {
|
||||
enum {
|
||||
DO_END,
|
||||
DO_ADD,
|
||||
DO_DEL,
|
||||
DO_DRY_ADD,
|
||||
DO_DRY_DEL,
|
||||
TEST_ADD,
|
||||
TEST_DEL
|
||||
} type;
|
||||
size_t modulus;
|
||||
size_t offset;
|
||||
} bench[] = {
|
||||
{ DO_ADD, 4, 0 }, /* add 4/0 */
|
||||
{ TEST_ADD, 4, 0 }, /* check 4/0 */
|
||||
{ TEST_DEL, 4, 1 }, /* check 4/1 */
|
||||
{ TEST_DEL, 4, 2 }, /* check 4/2 */
|
||||
{ TEST_DEL, 4, 3 }, /* check 4/3 */
|
||||
{ DO_DRY_DEL, 4, 1 }, /* del 4/1 */
|
||||
{ DO_DRY_DEL, 4, 2 }, /* del 4/2 */
|
||||
{ DO_DRY_DEL, 4, 3 }, /* del 4/3 */
|
||||
{ DO_ADD, 4, 1 }, /* add 4/1 */
|
||||
{ DO_DRY_ADD, 4, 1 }, /* add 4/1 */
|
||||
{ TEST_ADD, 4, 0 }, /* check 4/0 */
|
||||
{ TEST_ADD, 4, 1 }, /* check 4/1 */
|
||||
{ TEST_DEL, 4, 2 }, /* check 4/2 */
|
||||
{ TEST_DEL, 4, 3 }, /* check 4/3 */
|
||||
{ DO_ADD, 4, 2 }, /* add 4/2 */
|
||||
{ DO_DRY_DEL, 4, 3 }, /* del 4/3 */
|
||||
{ DO_ADD, 4, 3 }, /* add 4/3 */
|
||||
{ DO_DEL, 4, 3 }, /* del 4/3 */
|
||||
{ TEST_ADD, 4, 0 }, /* check 4/0 */
|
||||
{ TEST_ADD, 4, 1 }, /* check 4/1 */
|
||||
{ TEST_ADD, 4, 2 }, /* check 4/2 */
|
||||
{ TEST_DEL, 4, 3 }, /* check 4/3 */
|
||||
{ DO_DEL, 4, 0 }, /* del 4/0 */
|
||||
{ DO_DEL, 4, 1 }, /* del 4/1 */
|
||||
{ DO_DEL, 4, 2 }, /* del 4/2 */
|
||||
/* empty here */
|
||||
{ TEST_DEL, 1, 0 }, /* check */
|
||||
{ DO_ADD, 4, 0 }, /* add 4/0 */
|
||||
{ DO_ADD, 4, 1 }, /* add 4/1 */
|
||||
{ DO_ADD, 4, 2 }, /* add 4/2 */
|
||||
{ DO_DEL, 42, 0 }, /* add 42/0 */
|
||||
{ TEST_DEL, 42, 0 }, /* check 42/0 */
|
||||
{ TEST_ADD, 42, 2 }, /* check 42/2 */
|
||||
{ DO_END }
|
||||
};
|
||||
char *buff = NULL;
|
||||
const char **strings = NULL;
|
||||
|
||||
/* produce key #i */
|
||||
#define FMT() \
|
||||
char buffer[256]; \
|
||||
const char *name; \
|
||||
const long expected = (long) i * 1664525 + 1013904223; \
|
||||
do { \
|
||||
if (strings == NULL) { \
|
||||
snprintf(buffer, sizeof(buffer), \
|
||||
"http://www.example.com/website/sample/for/hashtable/" \
|
||||
"%ld/index.html?foo=%ld&bar", \
|
||||
(long) i, (long) (expected)); \
|
||||
name = buffer; \
|
||||
} else { \
|
||||
name = strings[i]; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* produce random patterns, or read from a file */
|
||||
if (sscanf(snum, "%lu", &count) != 1) {
|
||||
const off_t size = fsize(snum);
|
||||
FILE *fp = fopen(snum, "rb");
|
||||
if (fp != NULL) {
|
||||
buff = malloc(size);
|
||||
if (buff != NULL && fread(buff, 1, size, fp) == size) {
|
||||
size_t capa = 0;
|
||||
size_t i, last;
|
||||
for(i = 0, last = 0, count = 0 ; i < size ; i++) {
|
||||
if (buff[i] == 10 || buff[i] == 0) {
|
||||
buff[i] = '\0';
|
||||
if (capa == count) {
|
||||
if (capa == 0) {
|
||||
capa = 16;
|
||||
} else {
|
||||
capa <<= 1;
|
||||
}
|
||||
strings = (const char **) realloc((void*) strings, capa*sizeof(char*));
|
||||
}
|
||||
strings[count++] = &buff[last];
|
||||
last = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
/* successfully read */
|
||||
if (count > 0) {
|
||||
coucal hashtable = coucal_new(0);
|
||||
size_t loop;
|
||||
for(loop = 0 ; bench[loop].type != DO_END ; loop++) {
|
||||
size_t i;
|
||||
for(i = bench[loop].offset ; i < (size_t) count
|
||||
; i += bench[loop].modulus) {
|
||||
int result;
|
||||
FMT();
|
||||
if (bench[loop].type == DO_ADD
|
||||
|| bench[loop].type == DO_DRY_ADD) {
|
||||
size_t k;
|
||||
result = coucal_write(hashtable, name, (uintptr_t) expected);
|
||||
for(k = 0 ; k < /* stash_size*2 */ 32 ; k++) {
|
||||
(void) coucal_write(hashtable, name, (uintptr_t) expected);
|
||||
}
|
||||
/* revert logic */
|
||||
if (bench[loop].type == DO_DRY_ADD) {
|
||||
result = result ? 0 : 1;
|
||||
}
|
||||
}
|
||||
else if (bench[loop].type == DO_DEL
|
||||
|| bench[loop].type == DO_DRY_DEL) {
|
||||
size_t k;
|
||||
result = coucal_remove(hashtable, name);
|
||||
for(k = 0 ; k < /* stash_size*2 */ 32 ; k++) {
|
||||
(void) coucal_remove(hashtable, name);
|
||||
}
|
||||
/* revert logic */
|
||||
if (bench[loop].type == DO_DRY_DEL) {
|
||||
result = result ? 0 : 1;
|
||||
}
|
||||
}
|
||||
else if (bench[loop].type == TEST_ADD
|
||||
|| bench[loop].type == TEST_DEL) {
|
||||
intptr_t value = -1;
|
||||
result = coucal_readptr(hashtable, name, &value);
|
||||
if (bench[loop].type == TEST_ADD && result
|
||||
&& value != expected) {
|
||||
fprintf(stderr, "value failed for %s (expected %ld, got %ld)\n",
|
||||
name, (long) expected, (long) value);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
/* revert logic */
|
||||
if (bench[loop].type == TEST_DEL) {
|
||||
result = result ? 0 : 1;
|
||||
}
|
||||
}
|
||||
if (!result) {
|
||||
fprintf(stderr, "failed %s{%d/+%d} test on loop %ld"
|
||||
" at offset %ld for %s\n",
|
||||
names[bench[loop].type],
|
||||
(int) bench[loop].modulus,
|
||||
(int) bench[loop].offset,
|
||||
(long) loop, (long) i, name);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
coucal_delete(&hashtable);
|
||||
fprintf(stderr, "all hashtable tests were successful!\n");
|
||||
} else {
|
||||
fprintf(stderr, "Malformed number\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#undef FMT
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"Option #7 needs to be followed by a number");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '9': { // copy_htsopt selftest: httrack -#9
|
||||
httrackp *from = hts_create_opt();
|
||||
httrackp *to = hts_create_opt();
|
||||
int err = 0;
|
||||
|
||||
/* from-values differ from both the to-values and the
|
||||
hts_create_opt() defaults (nearlink FALSE, errpage/parseall
|
||||
TRUE), so a copy that no-ops or just resets to defaults is
|
||||
caught too, not only the unsigned-guard bug. */
|
||||
from->retry = 7; /* int field: positive control */
|
||||
to->retry = 0;
|
||||
from->nearlink = HTS_TRUE;
|
||||
to->nearlink = HTS_FALSE;
|
||||
from->errpage = HTS_FALSE;
|
||||
to->errpage = HTS_TRUE;
|
||||
from->parseall = HTS_FALSE;
|
||||
to->parseall = HTS_TRUE;
|
||||
|
||||
copy_htsopt(from, to);
|
||||
|
||||
if (to->retry != 7)
|
||||
err = 1;
|
||||
if (to->nearlink != HTS_TRUE)
|
||||
err = 1;
|
||||
if (to->errpage != HTS_FALSE)
|
||||
err = 1;
|
||||
if (to->parseall != HTS_FALSE)
|
||||
err = 1;
|
||||
|
||||
hts_free_opt(from);
|
||||
hts_free_opt(to);
|
||||
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} break;
|
||||
case 'Q': { // cookie request-header selftest: httrack -#Q
|
||||
static t_cookie cookie;
|
||||
char hdr[1024];
|
||||
/* RFC 6265: bare name=value pairs, no $Version/$Path (#151). */
|
||||
const char *expected = "Cookie: name=value; has_js=1" H_CRLF;
|
||||
int err = 0;
|
||||
|
||||
const char *dom = "www.example.com";
|
||||
int added;
|
||||
|
||||
cookie.max_len = (int) sizeof(cookie.data);
|
||||
cookie.data[0] = '\0';
|
||||
added = cookie_add(&cookie, "name", "value", dom, "/");
|
||||
added |= cookie_add(&cookie, "has_js", "1", dom, "/");
|
||||
/* different domain: must be filtered out */
|
||||
added |= cookie_add(&cookie, "junk", "x", "other.org", "/");
|
||||
if (added) {
|
||||
printf("cookie-header: FAIL (cookie_add setup)\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
http_cookie_header_selftest(&cookie, dom, "/", hdr,
|
||||
sizeof(hdr));
|
||||
if (strcmp(hdr, expected) != 0)
|
||||
err = 1;
|
||||
if (strstr(hdr, "$Version") != NULL ||
|
||||
strstr(hdr, "$Path") != NULL)
|
||||
err = 1;
|
||||
if (strstr(hdr, "junk") != NULL) // wrong-domain cookie leaked
|
||||
err = 1;
|
||||
printf("cookie-header: %s\n", err ? "FAIL" : "OK");
|
||||
if (err)
|
||||
printf(" got: %s\n", hdr);
|
||||
htsmain_free();
|
||||
return err;
|
||||
} break;
|
||||
case '!':
|
||||
HTS_PANIC_PRINTF
|
||||
("Option #! is disabled for security reasons");
|
||||
|
||||
359
src/htsdns_selftest.c
Normal file
359
src/htsdns_selftest.c
Normal file
@@ -0,0 +1,359 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 2026 Xavier Roche and other contributors
|
||||
|
||||
SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||
addresses or to collect any other private information about people. Doing so
|
||||
would dishonor our work and waste the many hours we have spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htsdns_selftest.c subroutines: */
|
||||
/* in-process self-test for the DNS resolver and cache */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/* Routes the resolver through a scripted getaddrinfo (hts_resolver_backend)
|
||||
instead of the network, so resolution and the DNS cache are testable for a
|
||||
fixed set of scenarios (IPv4/IPv6/dual-stack, errors, family filter,
|
||||
cache reuse) with no live DNS. */
|
||||
|
||||
#define HTS_INTERNAL_BYTECODE
|
||||
|
||||
#include "htsdns_selftest.h"
|
||||
|
||||
#include "htscore.h"
|
||||
#include "htslib.h"
|
||||
#include "htsnet.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#if HTS_INET6 != 0
|
||||
|
||||
/* IPV6_resolver: 0 = v4+v6, 1 = v4 only, 2 = v6 only (htscoremain -@i). */
|
||||
extern int IPV6_resolver;
|
||||
|
||||
/* One scripted host: either a getaddrinfo error, or an ordered address list. */
|
||||
typedef struct mock_addr {
|
||||
int family; /* AF_INET / AF_INET6 */
|
||||
unsigned char addr[16]; /* 4 (v4) or 16 (v6) meaningful bytes */
|
||||
} mock_addr;
|
||||
|
||||
typedef struct mock_host {
|
||||
const char *name;
|
||||
int gai_err; /* non-zero: getaddrinfo returns this */
|
||||
int naddr;
|
||||
mock_addr addr[6];
|
||||
int calls; /* times the backend resolved this host */
|
||||
} mock_host;
|
||||
|
||||
static mock_host mock_hosts[] = {
|
||||
{"v4only.test", 0, 1, {{AF_INET, {1, 2, 3, 4}}}, 0},
|
||||
{"v6only.test", 0, 1, {{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 1}}}, 0},
|
||||
/* dual stack, IPv6 first (RFC 6724 order) then IPv4 */
|
||||
{"dual.test",
|
||||
0,
|
||||
2,
|
||||
{{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 2}}, {AF_INET, {5, 6, 7, 8}}},
|
||||
0},
|
||||
/* dual stack, IPv4 first: distinguishes "keep the first address" from
|
||||
"prefer a family", so the selection contract is actually pinned. */
|
||||
{"dual4.test",
|
||||
0,
|
||||
2,
|
||||
{{AF_INET, {9, 10, 11, 12}},
|
||||
{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 3}}},
|
||||
0},
|
||||
/* more addresses than HTS_MAXADDRNUM: the list must clamp to the cap. */
|
||||
{"many.test",
|
||||
0,
|
||||
6,
|
||||
{{AF_INET, {10, 0, 0, 1}},
|
||||
{AF_INET, {10, 0, 0, 2}},
|
||||
{AF_INET, {10, 0, 0, 3}},
|
||||
{AF_INET, {10, 0, 0, 4}},
|
||||
{AF_INET, {10, 0, 0, 5}},
|
||||
{AF_INET, {10, 0, 0, 6}}},
|
||||
0},
|
||||
{"nodns.test", EAI_NONAME, 0, {{0}}, 0},
|
||||
};
|
||||
|
||||
static mock_host *mock_find(const char *name) {
|
||||
for (size_t i = 0; i < sizeof(mock_hosts) / sizeof(mock_hosts[0]); i++) {
|
||||
if (strcmp(mock_hosts[i].name, name) == 0)
|
||||
return &mock_hosts[i];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void mock_reset_calls(void) {
|
||||
for (size_t i = 0; i < sizeof(mock_hosts) / sizeof(mock_hosts[0]); i++)
|
||||
mock_hosts[i].calls = 0;
|
||||
}
|
||||
|
||||
/* Build one addrinfo node owning its sockaddr (freed by mock_freeaddrinfo).
   a: scripted address; AF_INET copies 4 bytes, anything else is built as
      AF_INET6 and copies 16 bytes.
   Returns the new node (ai_next left NULL by the zeroing allocation), or NULL
   if an allocation fails, in which case nothing is leaked. */
static struct addrinfo *mock_mkai(const mock_addr *a) {
  struct addrinfo *ai = calloct(1, sizeof(*ai));

  if (ai == NULL)
    return NULL;
  ai->ai_family = a->family;
  if (a->family == AF_INET) {
    struct sockaddr_in *sin = calloct(1, sizeof(*sin));

    if (sin == NULL) {
      freet(ai);
      return NULL;
    }
    sin->sin_family = AF_INET;
    memcpy(&sin->sin_addr, a->addr, 4);
    ai->ai_addr = (struct sockaddr *) sin;
    ai->ai_addrlen = sizeof(*sin);
  } else {
    struct sockaddr_in6 *sin6 = calloct(1, sizeof(*sin6));

    if (sin6 == NULL) {
      freet(ai);
      return NULL;
    }
    sin6->sin6_family = AF_INET6;
    memcpy(&sin6->sin6_addr, a->addr, 16);
    ai->ai_addr = (struct sockaddr *) sin6;
    ai->ai_addrlen = sizeof(*sin6);
  }
  return ai;
}

/* Scripted replacement for getaddrinfo(), answering from mock_hosts.
   node:    host name to look up; NULL or unknown names yield EAI_NONAME.
   service: ignored.
   hints:   only ai_family is read; NULL means PF_UNSPEC (no family filter).
   res:     receives the address list in scripted order (NULL on any error);
            release it with mock_freeaddrinfo().
   Returns 0 on success, the host's scripted gai_err, EAI_NONAME when the
   host is unknown or the family filter leaves no address, or EAI_MEMORY when
   a node cannot be allocated. Every lookup of a known host bumps its
   `calls` counter, including scripted errors. */
static int mock_getaddrinfo(const char *node, const char *service,
                            const struct addrinfo *hints,
                            struct addrinfo **res) {
  /* a NULL node cannot name a scripted host: do not hand it to strcmp() */
  mock_host *const h = (node != NULL) ? mock_find(node) : NULL;
  const int want = (hints != NULL) ? hints->ai_family : PF_UNSPEC;
  struct addrinfo *head = NULL, *tail = NULL;

  (void) service;
  *res = NULL;
  if (h == NULL)
    return EAI_NONAME;
  h->calls++; /* a real backend hit; a cached host skips this */
  if (h->gai_err != 0)
    return h->gai_err;
  for (int i = 0; i < h->naddr; i++) {
    struct addrinfo *ai;

    if (want != PF_UNSPEC && want != h->addr[i].family)
      continue; /* honor the requested family (v4/v6 only) */
    ai = mock_mkai(&h->addr[i]);
    if (ai == NULL) {
      /* out of memory: release the partial list instead of leaking it */
      while (head != NULL) {
        struct addrinfo *next = head->ai_next;

        freet(head->ai_addr);
        freet(head);
        head = next;
      }
      return EAI_MEMORY;
    }
    if (head == NULL)
      head = ai;
    else
      tail->ai_next = ai;
    tail = ai;
  }
  if (head == NULL)
    return EAI_NONAME; /* filtered to empty, as the libc resolver does */
  *res = head;
  return 0;
}
|
||||
|
||||
static void mock_freeaddrinfo(struct addrinfo *res) {
|
||||
while (res != NULL) {
|
||||
struct addrinfo *const next = res->ai_next;
|
||||
|
||||
freet(res->ai_addr);
|
||||
freet(res);
|
||||
res = next;
|
||||
}
|
||||
}
|
||||
|
||||
static const hts_resolver_backend mock_backend = {mock_getaddrinfo,
|
||||
mock_freeaddrinfo};
|
||||
|
||||
static int failures = 0;
|
||||
|
||||
#define CHECK(cond) \
|
||||
do { \
|
||||
if (!(cond)) { \
|
||||
failures++; \
|
||||
fprintf(stderr, "dns-selftest: FAIL at %s:%d: %s\n", __FILE__, __LINE__, \
|
||||
#cond); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* Resolve via the uncached entry point; return the address family, or
|
||||
AF_UNSPEC if the host did not resolve. */
|
||||
static int resolve_family_nocache(const char *host) {
|
||||
SOCaddr addr;
|
||||
const char *err = NULL;
|
||||
|
||||
if (hts_dns_resolve_nocache2(host, &addr, &err) == NULL)
|
||||
return AF_UNSPEC;
|
||||
return SOCaddr_sinfamily(addr);
|
||||
}
|
||||
|
||||
/* Run the scripted DNS resolver/cache scenarios against the mock backend.
   opt: engine options passed to the cached resolver entry points and to
        newhttp_addr().
   Returns the number of failed CHECKs (0 == success); each failure is also
   reported on stderr by CHECK.
   The mock backend is installed for the duration of the call and removed
   (backend set back to NULL) before returning. The process-global
   IPV6_resolver family filter is changed by several scenarios; it is restored
   to the value it had on entry so a caller's -@i setting is not clobbered. */
int dns_selftests(httrackp *opt) {
  const int saved_ipv6_resolver = IPV6_resolver;

  failures = 0;
  hts_dns_set_resolver_backend(&mock_backend);

  /* IPv4-only / IPv6-only hosts map to the right family. */
  IPV6_resolver = 0;
  CHECK(resolve_family_nocache("v4only.test") == AF_INET);
  CHECK(resolve_family_nocache("v6only.test") == AF_INET6);

  /* Dual-stack: the single-address API returns the *first* resolved address.
     Both orderings pin selection by position, not a family preference. The
     multi-address API (resolve_all, below) exposes the whole list. */
  CHECK(resolve_family_nocache("dual.test") == AF_INET6); /* v6 listed first */
  CHECK(resolve_family_nocache("dual4.test") == AF_INET); /* v4 listed first */

  /* Unknown host does not resolve. */
  CHECK(resolve_family_nocache("nodns.test") == AF_UNSPEC);

  /* Family filter (-@i4 / -@i6) selects v4 / v6 out of the dual-stack host. */
  IPV6_resolver = 1;
  CHECK(resolve_family_nocache("dual.test") == AF_INET);
  IPV6_resolver = 2;
  CHECK(resolve_family_nocache("dual.test") == AF_INET6);
  IPV6_resolver = 0;

  /* Cached driver resolves a host once and reuses the *same* address. */
  mock_reset_calls();
  {
    SOCaddr a1, a2;
    char ip1[64], ip2[64];
    const char *err = NULL;

    CHECK(hts_dns_resolve2(opt, "v4only.test", &a1, &err) != NULL);
    CHECK(hts_dns_resolve2(opt, "v4only.test", &a2, &err) != NULL);
    CHECK(mock_find("v4only.test")->calls == 1);
    /* the cache returns the right address, not merely a hit for the key */
    SOCaddr_inetntoa(ip1, sizeof(ip1), a1);
    SOCaddr_inetntoa(ip2, sizeof(ip2), a2);
    CHECK(strcmp(ip1, "1.2.3.4") == 0);
    CHECK(strcmp(ip1, ip2) == 0);
  }

  /* A negative result is cached too: a second lookup does not re-resolve. */
  {
    SOCaddr a1, a2;
    const char *err = NULL;

    CHECK(hts_dns_resolve2(opt, "nodns.test", &a1, &err) == NULL);
    CHECK(hts_dns_resolve2(opt, "nodns.test", &a2, &err) == NULL);
    CHECK(mock_find("nodns.test")->calls == 1); /* resolved once, then cached */
  }

  /* Multi-address resolution: count and order are the connect-fallback
     contract. A dead first address is retried against the next, so both must be
     exact. */
  mock_reset_calls();
  {
    SOCaddr addrs[HTS_MAXADDRNUM];
    char ip[64];
    const char *err = NULL;

    /* dual-stack, in resolver order: [0]=v6, [1]=v4 */
    CHECK(hts_dns_resolve_all(opt, "dual.test", addrs, HTS_MAXADDRNUM, &err) ==
          2);
    CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET6);
    CHECK(SOCaddr_sinfamily(addrs[1]) == AF_INET);
    SOCaddr_inetntoa(ip, sizeof(ip), addrs[1]);
    CHECK(strcmp(ip, "5.6.7.8") == 0);
    CHECK(mock_find("dual.test")->calls ==
          1); /* one backend hit for the list */

    /* single-address host: count 1 */
    CHECK(hts_dns_resolve_all(opt, "v4only.test", addrs, HTS_MAXADDRNUM,
                              &err) == 1);
    SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
    CHECK(strcmp(ip, "1.2.3.4") == 0);

    /* does-not-resolve: count 0 (negative), no addresses */
    CHECK(hts_dns_resolve_all(opt, "nodns.test", addrs, HTS_MAXADDRNUM, &err) ==
          0);

    /* more than the cap: the kept list is clamped to HTS_MAXADDRNUM, keeping
       the FIRST addresses in resolver order (not some other window) */
    CHECK(hts_dns_resolve_all(opt, "many.test", addrs, HTS_MAXADDRNUM, &err) ==
          HTS_MAXADDRNUM);
    SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
    CHECK(strcmp(ip, "10.0.0.1") == 0);
    SOCaddr_inetntoa(ip, sizeof(ip), addrs[HTS_MAXADDRNUM - 1]);
    CHECK(strcmp(ip, "10.0.0.4") == 0);

    /* family filter still applies through the list path */
    IPV6_resolver = 1;
    CHECK(hts_dns_resolve_all(opt, "dual4.test", addrs, HTS_MAXADDRNUM, &err) ==
          1);
    CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET);
    IPV6_resolver = 0;
  }

  /* newhttp_addr() must connect to the addr_index-th address, not always the
     first: this is what back_connect_next relies on to reach the fallback. */
  {
    htsblk r;
    int count = -1;
    T_SOC s;

    hts_init_htsblk(&r);
    s = newhttp_addr(opt, "dual.test", &r, 80, 0, 0, &count);
    CHECK(count == 2);
    CHECK(SOCaddr_sinfamily(r.address) == AF_INET6); /* index 0 = v6 */
    if (s != INVALID_SOCKET)
      deletesoc(s);

    hts_init_htsblk(&r);
    count = -1;
    s = newhttp_addr(opt, "dual.test", &r, 80, 0, 1, &count);
    CHECK(count == 2);
    CHECK(SOCaddr_sinfamily(r.address) == AF_INET); /* index 1 = v4 */
    if (s != INVALID_SOCKET)
      deletesoc(s);

    /* out-of-range index: no address selected (address stays unset) */
    hts_init_htsblk(&r);
    s = newhttp_addr(opt, "dual.test", &r, 80, 0, 2, NULL);
    CHECK(s == INVALID_SOCKET);
    if (s != INVALID_SOCKET)
      deletesoc(s); /* do not leak the socket if the check above failed */
  }

  /* Connect-fallback decision (consumer of the multi-address list): when a
     stuck connect should abandon the current address for the next one. */
  {
    /* no fallback for the last/only candidate, whatever the elapsed time */
    CHECK(back_connect_fallback_due(0, 1, 9999, 120) == 0);
    CHECK(back_connect_fallback_due(1, 2, 9999, 120) == 0);
    CHECK(back_connect_fallback_due(3, 4, 9999, 120) == 0);
    /* fallback available: wait the per-candidate deadline (cap 10s here) */
    CHECK(back_connect_fallback_due(0, 2, 9, 120) == 0);
    CHECK(back_connect_fallback_due(0, 2, 10, 120) == 1);
    CHECK(back_connect_fallback_due(2, 4, 10, 120) == 1);
    /* a shorter slot timeout shortens the deadline (min(timeout, cap)) */
    CHECK(back_connect_fallback_due(0, 2, 4, 5) == 0);
    CHECK(back_connect_fallback_due(0, 2, 5, 5) == 1);
    /* no timeout management: never force a fallback */
    CHECK(back_connect_fallback_due(0, 2, 9999, 0) == 0);
  }

  hts_dns_set_resolver_backend(NULL);
  /* hand the family filter back exactly as the caller configured it */
  IPV6_resolver = saved_ipv6_resolver;
  return failures;
}
|
||||
|
||||
#else
|
||||
|
||||
/* Stub used when HTS_INET6 is 0: runs no checks, ignores opt, and always
   reports zero failures. */
int dns_selftests(httrackp *opt) {
|
||||
(void) opt;
|
||||
return 0; /* resolver seam only exists in the IPv6 build */
|
||||
}
|
||||
|
||||
#endif
|
||||
51
src/htsdns_selftest.h
Normal file
51
src/htsdns_selftest.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 2026 Xavier Roche and other contributors
|
||||
|
||||
SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||
addresses or to collect any other private information about people. Doing so
|
||||
would dishonor our work and waste the many hours we have spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htsdns_selftest.h */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSDNS_SELFTEST_DEFH
|
||||
#define HTSDNS_SELFTEST_DEFH
|
||||
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||
#define HTS_DEF_FWSTRUCT_httrackp
|
||||
typedef struct httrackp httrackp;
|
||||
#endif
|
||||
|
||||
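/* Drive the DNS resolver and cache through a scripted (mock) getaddrinfo,
   asserting address family, single-address selection, negative caching, the
   IPv4/IPv6 family filter, and that a cached host is resolved only once. Also
   checks multi-address resolution (count, order, clamping to HTS_MAXADDRNUM),
   address selection by index in newhttp_addr(), and the connect-fallback
   decision. Returns the number of failed checks (0 == success); builds without
   IPv6 support (HTS_INET6 == 0) run no checks and always return 0. */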
|
||||
int dns_selftests(httrackp *opt);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-8"
|
||||
#define HTTRACK_VERSIONID "3.49.8"
|
||||
#define HTTRACK_VERSION "3.49-9"
|
||||
#define HTTRACK_VERSIONID "3.49.9"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
@@ -247,13 +247,23 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_NOPARAM "(none)"
|
||||
#define HTS_NOPARAM2 "\"(none)\""
|
||||
|
||||
/* Boolean flag for option fields and API yes/no returns. An enum (not C bool)
|
||||
so it stays int-sized: option fields keep the httrackp layout/ABI, and a
|
||||
return type stays compatible with the int it replaces. */
|
||||
/* Boolean flag for option fields and API yes/no returns. Int-backed, not an
|
||||
enum: an enum makes C++ reject `field = 1` / `f(0)` on the exported fields
|
||||
and params. Int-sized, so the httrackp layout and the ABI are unchanged. */
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
|
||||
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
||||
typedef int hts_boolean;
|
||||
#define HTS_FALSE 0
|
||||
#define HTS_TRUE 1
|
||||
#endif
|
||||
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_tristate
|
||||
#define HTS_DEF_DEFSTRUCT_hts_tristate
|
||||
/* Tri-state hts_boolean: HTS_DEFAULT (-1) = "unspecified" (copy_htsopt leaves
|
||||
the target untouched); HTS_FALSE/HTS_TRUE = off/on. */
|
||||
typedef int hts_tristate;
|
||||
#define HTS_DEFAULT (-1)
|
||||
#endif
|
||||
|
||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||
@@ -398,6 +408,10 @@ typedef int T_SOC;
|
||||
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
||||
#define HTS_MAXADDRLEN 64
|
||||
|
||||
/* Max resolved addresses kept per host for connect fallback (dead IPv6 etc.).
|
||||
*/
|
||||
#define HTS_MAXADDRNUM 4
|
||||
|
||||
#ifdef _WIN32
|
||||
#else
|
||||
#define __cdecl
|
||||
|
||||
@@ -646,9 +646,7 @@ void help(const char *app, int more) {
|
||||
infomsg("");
|
||||
infomsg("Guru options: (do NOT use if possible)");
|
||||
infomsg(" #X *use optimized engine (limited memory boundary checks)");
|
||||
infomsg(" #0 filter test (-#0 '*.gif' 'www.bar.com/foo.gif')");
|
||||
infomsg(" #1 simplify test (-#1 ./foo/bar/../foobar)");
|
||||
infomsg(" #2 type test (-#2 /foo/bar.php)");
|
||||
infomsg(" #test list engine self-tests (run one with -#test=NAME [args])");
|
||||
infomsg(" #C cache list (-#C '*.com/spider*.gif'");
|
||||
infomsg(" #R cache repair (damaged cache)");
|
||||
infomsg(" #d debug parser");
|
||||
|
||||
529
src/htslib.c
529
src/htslib.c
@@ -1423,7 +1423,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
else
|
||||
infostatuscode(retour->msg, retour->statuscode);
|
||||
// type MIME par défaut2
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
} else { // pas de code!
|
||||
retour->statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(retour->msg, "Unknown response structure");
|
||||
@@ -1438,7 +1438,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
retour->statuscode = HTTP_OK;
|
||||
retour->keep_alive = 0;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
} else if (strnotempty(a)) {
|
||||
retour->statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(retour->msg, "Unknown (not HTTP/xx) response structure");
|
||||
@@ -1447,7 +1447,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
retour->statuscode = HTTP_OK;
|
||||
retour->keep_alive = 0;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
}
|
||||
}
|
||||
} else { // vide!
|
||||
@@ -1458,7 +1458,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
/* This is dirty .. */
|
||||
retour->statuscode = HTTP_OK;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1589,11 +1589,15 @@ void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * ret
|
||||
}
|
||||
}
|
||||
}
|
||||
sscanf(rcvd + p, "%s", tempo);
|
||||
if (strlen(tempo) < sizeof(retour->contenttype) - 2) // pas trop long!!
|
||||
strcpybuff(retour->contenttype, tempo);
|
||||
else
|
||||
strcpybuff(retour->contenttype, "application/octet-stream-unknown"); // erreur
|
||||
// An empty/whitespace Content-Type value yields no token: keep the
|
||||
// sentinel default rather than reading an uninitialized tempo.
|
||||
if (sscanf(rcvd + p, "%s", tempo) == 1) {
|
||||
if (strlen(tempo) < sizeof(retour->contenttype) - 2) // pas trop long!!
|
||||
strcpybuff(retour->contenttype, tempo);
|
||||
else
|
||||
strcpybuff(retour->contenttype,
|
||||
"application/octet-stream-unknown"); // erreur
|
||||
}
|
||||
}
|
||||
} else if ((p = strfield(rcvd, "Content-Range:")) != 0) {
|
||||
// Content-Range: bytes 0-70870/70871
|
||||
@@ -2293,14 +2297,27 @@ htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc) {
|
||||
// peut ouvrir avec des connect() non bloquants: waitconnect=0/1
|
||||
T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
              int waitconnect) {
  /* Compatibility entry point: connect to the first resolved address and
     ignore the number of resolved candidates. */
  const int first_address = 0;
  int *const no_address_count = NULL;

  return newhttp_addr(opt, _iadr, retour, port, waitconnect, first_address,
                      no_address_count);
}
|
||||
|
||||
T_SOC newhttp_addr(httrackp *opt, const char *_iadr, htsblk *retour, int port,
|
||||
int waitconnect, int addr_index, int *addr_count) {
|
||||
T_SOC soc; // descipteur de la socket
|
||||
|
||||
if (addr_count != NULL) {
|
||||
*addr_count = 0;
|
||||
}
|
||||
|
||||
if (strcmp(_iadr, "file://") != 0) { /* non fichier */
|
||||
SOCaddr server;
|
||||
SOCaddr addrs[HTS_MAXADDRNUM];
|
||||
int naddr;
|
||||
const char *error = "unknown error";
|
||||
|
||||
// tester un éventuel id:pass et virer id:pass@ si détecté
|
||||
const char *const iadr = jump_identification_const(_iadr);
|
||||
const char *resolve_host = iadr;
|
||||
char BIGSTK iadr2[HTS_URLMAXSIZE * 2];
|
||||
|
||||
SOCaddr_clear(server);
|
||||
|
||||
@@ -2322,7 +2339,6 @@ T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
||||
#endif
|
||||
|
||||
if (a != NULL) {
|
||||
char BIGSTK iadr2[HTS_URLMAXSIZE * 2];
|
||||
int i = -1;
|
||||
|
||||
iadr2[0] = '\0';
|
||||
@@ -2333,18 +2349,19 @@ T_SOC newhttp(httrackp * opt, const char *_iadr, htsblk * retour, int port,
|
||||
|
||||
// adresse véritable (sans :xx)
|
||||
strncatbuff(iadr2, iadr, (int) (a - iadr));
|
||||
|
||||
// adresse sans le :xx
|
||||
hts_dns_resolve2(opt, iadr2, &server, &error);
|
||||
|
||||
} else {
|
||||
|
||||
// adresse normale (port par défaut par la suite)
|
||||
hts_dns_resolve2(opt, iadr, &server, &error);
|
||||
resolve_host = iadr2;
|
||||
}
|
||||
}
|
||||
|
||||
} else { // port défini
|
||||
hts_dns_resolve2(opt, iadr, &server, &error);
|
||||
// resolve the full address list and pick the requested candidate; the
|
||||
// scheduler retries the next index when a connect fails (dead IPv6 etc.)
|
||||
naddr =
|
||||
hts_dns_resolve_all(opt, resolve_host, addrs, HTS_MAXADDRNUM, &error);
|
||||
if (addr_count != NULL) {
|
||||
*addr_count = naddr;
|
||||
}
|
||||
if (addr_index >= 0 && addr_index < naddr) {
|
||||
SOCaddr_copy_SOCaddr(server, addrs[addr_index]);
|
||||
}
|
||||
|
||||
if (!SOCaddr_is_valid(server)) {
|
||||
@@ -4160,9 +4177,10 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
/* Check html -> text/html */
|
||||
const char *a = fil + strlen(fil) - 1;
|
||||
|
||||
while((*a != '.') && (*a != '/') && (a > fil))
|
||||
/* a < fil when fil is empty: bound before dereferencing */
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (*a == '.' && strlen(a) < 32) {
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
|
||||
a++;
|
||||
@@ -4310,6 +4328,7 @@ int give_mimext(char *s, size_t ssize, const char *st) {
|
||||
int ok = 0;
|
||||
int j = 0;
|
||||
|
||||
st = hts_effective_mime(st); /* no declared type: derive an html ext */
|
||||
s[0] = '\0';
|
||||
while((!ok) && (strnotempty(hts_mime[j][1]))) {
|
||||
if (strfield2(hts_mime[j][0], st)) {
|
||||
@@ -4748,144 +4767,299 @@ int hts_read(htsblk * r, char *buff, int size) {
|
||||
// -- Gestion cache DNS --
|
||||
// 'RX98
|
||||
|
||||
// 'capsule' contenant uniquement le cache
|
||||
t_dnscache *hts_cache(httrackp * opt) {
|
||||
// Free a DNS cache record (coucal value handler).
|
||||
static void hts_cache_value_free(coucal_opaque arg, coucal_value value) {
  /* Value handler installed on the DNS cache table: releases the record held
     in value.ptr. The opaque handler argument is not used. */
  void *dns_record;

  (void) arg;
  dns_record = value.ptr;
  /* released through a local copy of the pointer, not through value.ptr */
  freet(dns_record);
}
|
||||
|
||||
// opt's DNS cache hashtable, created on first use. Records (t_dnscache*) are
|
||||
// owned by the table and freed by hts_cache_value_free on coucal_delete.
|
||||
coucal hts_cache(httrackp *opt) {
|
||||
assertf(opt != NULL);
|
||||
if (opt->state.dns_cache == NULL) {
|
||||
opt->state.dns_cache = (t_dnscache *) malloct(sizeof(t_dnscache));
|
||||
memset(opt->state.dns_cache, 0, sizeof(t_dnscache));
|
||||
coucal cache = coucal_new(0);
|
||||
|
||||
coucal_set_name(cache, "dns_cache");
|
||||
coucal_value_set_value_handler(cache, hts_cache_value_free, NULL);
|
||||
opt->state.dns_cache = cache;
|
||||
}
|
||||
assertf(opt->state.dns_cache != NULL);
|
||||
/* first entry is NULL */
|
||||
assertf(opt->state.dns_cache->iadr == NULL);
|
||||
return opt->state.dns_cache;
|
||||
}
|
||||
|
||||
// Free DNS cache.
|
||||
void hts_cache_free(t_dnscache *const root) {
|
||||
if (root != NULL) {
|
||||
t_dnscache *cache;
|
||||
for(cache = root; cache != NULL; ) {
|
||||
t_dnscache *const next = cache->next;
|
||||
cache->next = NULL;
|
||||
freet(cache);
|
||||
cache = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
// MUST BE LOCKED (coucal is not internally serialized vs FTP/web threads)
|
||||
// Look up iadr in the DNS cache, filling out[0..min(count,max)-1].
|
||||
// Returns: -1 not yet tested; 0 negative-cached (not in DNS); >0 address count.
|
||||
static int hts_ghbn_all(coucal cache, const char *const iadr,
|
||||
SOCaddr *const out, const int max) {
|
||||
void *ptr;
|
||||
|
||||
// lock le cache dns pour tout opération d'ajout
|
||||
// plus prudent quand plusieurs threads peuvent écrire dedans..
|
||||
// -1: status? 0: libérer 1:locker
|
||||
|
||||
// MUST BE LOCKED
|
||||
// routine pour le cache - retour optionnel à donner à chaque fois
|
||||
// NULL: nom non encore testé dans le cache
|
||||
// si h_length==0 alors le nom n'existe pas dans le dns
|
||||
static SOCaddr* hts_ghbn(const t_dnscache *cache, const char *const iadr, SOCaddr *const addr) {
|
||||
assertf(addr != NULL);
|
||||
assertf(out != NULL);
|
||||
assertf(iadr != NULL);
|
||||
if (*iadr == '\0') {
|
||||
return NULL;
|
||||
return -1;
|
||||
}
|
||||
/* first entry is empty */
|
||||
if (cache->iadr == NULL) {
|
||||
cache = cache->next;
|
||||
}
|
||||
for(; cache != NULL; cache = cache->next) {
|
||||
assertf(cache != NULL);
|
||||
assertf(cache->iadr != NULL);
|
||||
assertf(cache->iadr == (const char*) cache + sizeof(t_dnscache));
|
||||
if (strcmp(cache->iadr, iadr) == 0) { // ok trouvé
|
||||
if (cache->host_length != 0) { // entrée valide
|
||||
assertf(cache->host_length <= sizeof(cache->host_addr));
|
||||
SOCaddr_copyaddr2(*addr, cache->host_addr, cache->host_length);
|
||||
return addr;
|
||||
} else { // erreur dans le dns, déja vérifié
|
||||
SOCaddr_clear(*addr);
|
||||
return addr;
|
||||
}
|
||||
if (coucal_read_pvoid(cache, iadr, &ptr)) { // ok trouvé
|
||||
const t_dnscache *const record = (const t_dnscache *) ptr;
|
||||
int i;
|
||||
|
||||
assertf(record->host_count <= HTS_MAXADDRNUM);
|
||||
for (i = 0; i < record->host_count && i < max; i++) {
|
||||
assertf(record->host_length[i] <= sizeof(record->host_addr[i]));
|
||||
SOCaddr_copyaddr2(out[i], record->host_addr[i], record->host_length[i]);
|
||||
}
|
||||
return record->host_count;
|
||||
}
|
||||
return NULL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
static SOCaddr* hts_dns_resolve_nocache2_(const char *const hostname,
|
||||
SOCaddr *const addr,
|
||||
const char **error) {
|
||||
#if HTS_INET6 != 0
|
||||
/* Active resolver backend; defaults to the libc resolver. The self-test
|
||||
reroutes it to script DNS answers in-process (see
|
||||
hts_dns_set_resolver_backend). */
|
||||
static const hts_resolver_backend hts_resolver_libc = {getaddrinfo,
|
||||
freeaddrinfo};
|
||||
static const hts_resolver_backend *hts_resolver = &hts_resolver_libc;
|
||||
|
||||
void hts_dns_set_resolver_backend(const hts_resolver_backend *backend) {
|
||||
hts_resolver = (backend != NULL) ? backend : &hts_resolver_libc;
|
||||
}
|
||||
|
||||
/* Debug/test hook: HTTRACK_DEBUG_RESOLVE="host:ip[,ip...]" pins the resolution
|
||||
of `host` to the listed addresses (curl --resolve style), so the connect
|
||||
fallback can be exercised deterministically (a dead address first, a live one
|
||||
next). Any other host resolves normally. Below: an addrinfo backend that owns
|
||||
its chain (its own freeaddrinfo), so a synthesized and a delegated result
|
||||
free the same way. */
|
||||
|
||||
/* Deep-copy a libc addrinfo chain into our own allocations. */
|
||||
static struct addrinfo *resolver_dup_chain(const struct addrinfo *src) {
|
||||
struct addrinfo *head = NULL, *tail = NULL;
|
||||
|
||||
for (; src != NULL; src = src->ai_next) {
|
||||
struct addrinfo *const ai = calloct(1, sizeof(*ai));
|
||||
|
||||
ai->ai_family = src->ai_family;
|
||||
ai->ai_socktype = src->ai_socktype;
|
||||
ai->ai_protocol = src->ai_protocol;
|
||||
ai->ai_addrlen = src->ai_addrlen;
|
||||
ai->ai_addr = malloct(src->ai_addrlen);
|
||||
memcpy(ai->ai_addr, src->ai_addr, src->ai_addrlen);
|
||||
if (head == NULL)
|
||||
head = ai;
|
||||
else
|
||||
tail->ai_next = ai;
|
||||
tail = ai;
|
||||
}
|
||||
return head;
|
||||
}
|
||||
|
||||
/* Build one addrinfo node from an IPv4/IPv6 literal, or NULL if it does not
|
||||
parse or is filtered out by want_family (AF_INET/AF_INET6/PF_UNSPEC). */
|
||||
static struct addrinfo *resolver_make_ai(const char *ip, int want_family) {
|
||||
struct addrinfo *ai;
|
||||
|
||||
if (strchr(ip, ':') != NULL) { // IPv6 literal
|
||||
struct sockaddr_in6 sa6;
|
||||
|
||||
if (want_family != PF_UNSPEC && want_family != AF_INET6)
|
||||
return NULL;
|
||||
memset(&sa6, 0, sizeof(sa6));
|
||||
if (inet_pton(AF_INET6, ip, &sa6.sin6_addr) != 1)
|
||||
return NULL;
|
||||
sa6.sin6_family = AF_INET6;
|
||||
ai = calloct(1, sizeof(*ai));
|
||||
ai->ai_family = AF_INET6;
|
||||
ai->ai_addrlen = sizeof(sa6);
|
||||
ai->ai_addr = malloct(sizeof(sa6));
|
||||
memcpy(ai->ai_addr, &sa6, sizeof(sa6));
|
||||
} else { // IPv4 literal
|
||||
struct sockaddr_in sa;
|
||||
|
||||
if (want_family != PF_UNSPEC && want_family != AF_INET)
|
||||
return NULL;
|
||||
memset(&sa, 0, sizeof(sa));
|
||||
if (inet_pton(AF_INET, ip, &sa.sin_addr) != 1)
|
||||
return NULL;
|
||||
sa.sin_family = AF_INET;
|
||||
ai = calloct(1, sizeof(*ai));
|
||||
ai->ai_family = AF_INET;
|
||||
ai->ai_addrlen = sizeof(sa);
|
||||
ai->ai_addr = malloct(sizeof(sa));
|
||||
memcpy(ai->ai_addr, &sa, sizeof(sa));
|
||||
}
|
||||
return ai;
|
||||
}
|
||||
|
||||
static void override_freeaddrinfo(struct addrinfo *res) {
|
||||
while (res != NULL) {
|
||||
struct addrinfo *const next = res->ai_next;
|
||||
|
||||
freet(res->ai_addr);
|
||||
freet(res);
|
||||
res = next;
|
||||
}
|
||||
}
|
||||
|
||||
static int override_getaddrinfo(const char *node, const char *service,
|
||||
const struct addrinfo *hints,
|
||||
struct addrinfo **res) {
|
||||
const char *const spec = getenv("HTTRACK_DEBUG_RESOLVE");
|
||||
const int want = (hints != NULL) ? hints->ai_family : PF_UNSPEC;
|
||||
const char *colon;
|
||||
|
||||
*res = NULL;
|
||||
if (spec != NULL && node != NULL && (colon = strchr(spec, ':')) != NULL &&
|
||||
(size_t) (colon - spec) == strlen(node) &&
|
||||
strncmp(spec, node, colon - spec) == 0) {
|
||||
struct addrinfo *head = NULL, *tail = NULL;
|
||||
char buf[256];
|
||||
char *p;
|
||||
|
||||
buf[0] = '\0';
|
||||
strncatbuff(buf, colon + 1, sizeof(buf) - 1);
|
||||
for (p = strtok(buf, ","); p != NULL; p = strtok(NULL, ",")) {
|
||||
struct addrinfo *const ai = resolver_make_ai(p, want);
|
||||
|
||||
if (ai != NULL) {
|
||||
if (head == NULL)
|
||||
head = ai;
|
||||
else
|
||||
tail->ai_next = ai;
|
||||
tail = ai;
|
||||
}
|
||||
}
|
||||
if (head == NULL)
|
||||
return EAI_NONAME;
|
||||
*res = head;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* not overridden: delegate to libc, copying into our owned format */
|
||||
{
|
||||
#if HTS_INET6==0
|
||||
/* IPv4 resolver */
|
||||
struct hostent *const hp = gethostbyname(hostname);
|
||||
struct addrinfo *sys = NULL;
|
||||
int gerr = getaddrinfo(node, service, hints, &sys);
|
||||
|
||||
if (hp != NULL) {
|
||||
SOCaddr_copyaddr2(addr, hp->h_addr_list[0], hp->h_length);
|
||||
return SOCaddr_is_valid(addr) ? &addr : NULL;
|
||||
} else {
|
||||
SOCaddr_clear(*addr);
|
||||
}
|
||||
#else
|
||||
/* IPv6 resolver */
|
||||
struct addrinfo *res = NULL;
|
||||
struct addrinfo hints;
|
||||
int gerr;
|
||||
|
||||
SOCaddr_clear(*addr);
|
||||
memset(&hints, 0, sizeof(hints));
|
||||
if (IPV6_resolver == 1) // V4 only (for bogus V6 entries)
|
||||
hints.ai_family = PF_INET;
|
||||
else if (IPV6_resolver == 2) // V6 only (for testing V6 only)
|
||||
hints.ai_family = PF_INET6;
|
||||
else // V4 + V6
|
||||
hints.ai_family = PF_UNSPEC;
|
||||
hints.ai_socktype = SOCK_STREAM;
|
||||
hints.ai_protocol = IPPROTO_TCP;
|
||||
if ( ( gerr = getaddrinfo(hostname, NULL, &hints, &res) ) == 0) {
|
||||
if (res != NULL) {
|
||||
if (res->ai_addr != NULL && res->ai_addrlen != 0) {
|
||||
SOCaddr_copyaddr2(*addr, res->ai_addr, res->ai_addrlen);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (error != NULL) {
|
||||
*error = gai_strerror(gerr);
|
||||
}
|
||||
}
|
||||
if (res) {
|
||||
freeaddrinfo(res);
|
||||
}
|
||||
#endif
|
||||
if (gerr != 0)
|
||||
return gerr;
|
||||
*res = resolver_dup_chain(sys);
|
||||
freeaddrinfo(sys);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return SOCaddr_is_valid(*addr) ? addr : NULL;
|
||||
}
|
||||
|
||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
||||
SOCaddr *const addr, const char **error) {
|
||||
/* Protection */
|
||||
if (!strnotempty(hostname)) {
|
||||
return NULL;
|
||||
}
|
||||
static const hts_resolver_backend hts_resolver_override = {
|
||||
override_getaddrinfo, override_freeaddrinfo};
|
||||
|
||||
/*
|
||||
Strip [] if any : [3ffe:b80:1234:1::1]
|
||||
The resolver doesn't seem to handle IP6 addresses in brackets
|
||||
*/
|
||||
/* Install the env override once, unless a backend was already set (self-test).
|
||||
*/
|
||||
static void hts_resolver_check_env(void) {
|
||||
static int checked = 0;
|
||||
|
||||
if (!checked) {
|
||||
checked = 1;
|
||||
if (hts_resolver == &hts_resolver_libc &&
|
||||
getenv("HTTRACK_DEBUG_RESOLVE") != NULL) {
|
||||
hts_resolver = &hts_resolver_override;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Resolve hostname into up to max addresses (resolver/RFC 6724 order), no
|
||||
// cache. Returns the count copied into out[0..count-1]; 0 = does not resolve.
|
||||
static int hts_dns_resolve_nocache_list_(const char *const hostname,
|
||||
SOCaddr *const out, const int max,
|
||||
const char **error) {
|
||||
int count = 0;
|
||||
|
||||
#if HTS_INET6==0
|
||||
/* IPv4 resolver */
|
||||
struct hostent *const hp = gethostbyname(hostname);
|
||||
|
||||
if (hp != NULL) {
|
||||
char **h;
|
||||
|
||||
for (h = hp->h_addr_list; count < max && h != NULL && *h != NULL; h++) {
|
||||
SOCaddr_clear(out[count]);
|
||||
SOCaddr_copyaddr2(out[count], *h, hp->h_length);
|
||||
if (SOCaddr_is_valid(out[count]))
|
||||
count++;
|
||||
}
|
||||
}
|
||||
#else
|
||||
/* IPv6 resolver */
|
||||
struct addrinfo *res = NULL, *cur;
|
||||
struct addrinfo hints;
|
||||
int gerr;
|
||||
|
||||
hts_resolver_check_env();
|
||||
memset(&hints, 0, sizeof(hints));
|
||||
if (IPV6_resolver == 1) // V4 only (for bogus V6 entries)
|
||||
hints.ai_family = PF_INET;
|
||||
else if (IPV6_resolver == 2) // V6 only (for testing V6 only)
|
||||
hints.ai_family = PF_INET6;
|
||||
else // V4 + V6
|
||||
hints.ai_family = PF_UNSPEC;
|
||||
hints.ai_socktype = SOCK_STREAM;
|
||||
hints.ai_protocol = IPPROTO_TCP;
|
||||
if ((gerr = hts_resolver->getaddrinfo(hostname, NULL, &hints, &res)) == 0) {
|
||||
for (cur = res; cur != NULL && count < max; cur = cur->ai_next) {
|
||||
if (cur->ai_addr != NULL && cur->ai_addrlen != 0) {
|
||||
SOCaddr_clear(out[count]);
|
||||
SOCaddr_copyaddr2(out[count], cur->ai_addr, cur->ai_addrlen);
|
||||
if (SOCaddr_is_valid(out[count]))
|
||||
count++;
|
||||
}
|
||||
}
|
||||
} else if (error != NULL) {
|
||||
*error = gai_strerror(gerr);
|
||||
}
|
||||
if (res) {
|
||||
hts_resolver->freeaddrinfo(res);
|
||||
}
|
||||
#endif
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
// Strip [] around a literal IPv6 ([3ffe:b80:1234:1::1]) the resolver won't
|
||||
// take, then resolve into a list. Returns the count.
|
||||
static int hts_dns_resolve_nocache_list(const char *const hostname,
|
||||
SOCaddr *const out, const int max,
|
||||
const char **error) {
|
||||
if (!strnotempty(hostname) || max <= 0) {
|
||||
return 0;
|
||||
}
|
||||
if ((hostname[0] == '[') && (hostname[strlen(hostname) - 1] == ']')) {
|
||||
SOCaddr *ret;
|
||||
size_t size = strlen(hostname);
|
||||
char *copy = malloct(size + 1);
|
||||
int count;
|
||||
|
||||
assertf(copy != NULL);
|
||||
copy[0] = '\0';
|
||||
strncat(copy, hostname + 1, size - 2);
|
||||
ret = hts_dns_resolve_nocache2_(copy, addr, error);
|
||||
count = hts_dns_resolve_nocache_list_(copy, out, max, error);
|
||||
freet(copy);
|
||||
return ret;
|
||||
return count;
|
||||
} else {
|
||||
return hts_dns_resolve_nocache2_(hostname, addr, error);
|
||||
return hts_dns_resolve_nocache_list_(hostname, out, max, error);
|
||||
}
|
||||
}
|
||||
|
||||
HTSEXT_API SOCaddr *hts_dns_resolve_nocache2(const char *const hostname,
                                             SOCaddr *const addr,
                                             const char **error) {
  /* Single-address, uncached lookup: asks the list resolver for at most one
     address, written directly into *addr. Returns addr when an address was
     found and is valid, NULL otherwise. */
  int found;

  /* cleared first, so a failed lookup leaves *addr empty */
  SOCaddr_clear(*addr);
  found = hts_dns_resolve_nocache_list(hostname, addr, 1, error);
  if (found <= 0) {
    return NULL;
  }
  if (!SOCaddr_is_valid(*addr)) {
    return NULL;
  }
  return addr;
}
|
||||
|
||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache(const char *const hostname,
                                            SOCaddr *const addr) {
  /* Same as hts_dns_resolve_nocache2(), without collecting an error message. */
  const char **const no_error_sink = NULL;

  return hts_dns_resolve_nocache2(hostname, addr, no_error_sink);
}
|
||||
@@ -4896,16 +5070,18 @@ HTSEXT_API int check_hostname_dns(const char *const hostname) {
|
||||
}
|
||||
|
||||
// Needs locking
|
||||
// cache dns interne à HTS // ** FREE A FAIRE sur la chaine
|
||||
static SOCaddr* hts_dns_resolve_(httrackp * opt, const char *_iadr,
|
||||
SOCaddr *const addr, const char **error) {
|
||||
// Internal DNS cache. Fill out[0..count-1] with up to max addresses for _iadr,
|
||||
// resolving (and caching the full list) on a miss. Returns the count.
|
||||
static int hts_dns_resolve_list_(httrackp *opt, const char *_iadr,
|
||||
SOCaddr *const out, const int max,
|
||||
const char **error) {
|
||||
char BIGSTK iadr[HTS_URLMAXSIZE * 2];
|
||||
t_dnscache *cache = hts_cache(opt); // adresse du cache
|
||||
SOCaddr *sa;
|
||||
coucal cache = hts_cache(opt); // le cache dns
|
||||
int count;
|
||||
|
||||
assertf(opt != NULL);
|
||||
assertf(_iadr != NULL);
|
||||
assertf(addr != NULL);
|
||||
assertf(out != NULL);
|
||||
|
||||
strcpybuff(iadr, jump_identification_const(_iadr));
|
||||
// couper éventuel :
|
||||
@@ -4917,55 +5093,67 @@ static SOCaddr* hts_dns_resolve_(httrackp * opt, const char *_iadr,
|
||||
}
|
||||
|
||||
/* get IP from the dns cache */
|
||||
sa = hts_ghbn(cache, iadr, addr);
|
||||
if (sa != NULL) {
|
||||
return SOCaddr_is_valid(*sa) ? sa : NULL;
|
||||
} else { // non présent dans le cache dns, tester
|
||||
const size_t iadr_len = strlen(iadr) + 1;
|
||||
|
||||
// find queue
|
||||
for(; cache->next != NULL; cache = cache->next) ;
|
||||
count = hts_ghbn_all(cache, iadr, out, max);
|
||||
if (count >= 0) { // cache hit (0 == negative-cached)
|
||||
return count;
|
||||
} else { // non présent dans le cache dns, tester
|
||||
SOCaddr resolved[HTS_MAXADDRNUM];
|
||||
t_dnscache *record;
|
||||
int i;
|
||||
|
||||
#if DEBUGDNS
|
||||
printf("resolving (not cached) %s\n", iadr);
|
||||
#endif
|
||||
|
||||
sa = hts_dns_resolve_nocache2(iadr, addr, error); // calculer IP host
|
||||
count = hts_dns_resolve_nocache_list(iadr, resolved, HTS_MAXADDRNUM, error);
|
||||
|
||||
#if HTS_WIDE_DEBUG
|
||||
DEBUG_W("gethostbyname done\n");
|
||||
#endif
|
||||
|
||||
/* attempt to store new entry */
|
||||
cache->next = malloct(sizeof(t_dnscache) + iadr_len);
|
||||
if (cache->next != NULL) {
|
||||
t_dnscache *const next = cache->next;
|
||||
char *const block = (char*) cache->next;
|
||||
char *const str = block + sizeof(t_dnscache);
|
||||
memcpy(str, iadr, iadr_len);
|
||||
next->iadr = str;
|
||||
if (sa != NULL) {
|
||||
next->host_length = SOCaddr_size(*sa);
|
||||
assertf(next->host_length <= sizeof(next->host_addr));
|
||||
memcpy(next->host_addr, &SOCaddr_sockaddr(*sa), next->host_length);
|
||||
} else {
|
||||
next->host_length = 0; // non existant dans le dns
|
||||
/* attempt to store new entry (coucal owns it and dups the host key) */
|
||||
record = malloct(sizeof(t_dnscache));
|
||||
if (record != NULL) {
|
||||
memset(record, 0, sizeof(*record));
|
||||
record->host_count = count;
|
||||
for (i = 0; i < count; i++) {
|
||||
record->host_length[i] = SOCaddr_size(resolved[i]);
|
||||
assertf(record->host_length[i] <= sizeof(record->host_addr[i]));
|
||||
memcpy(record->host_addr[i], &SOCaddr_sockaddr(resolved[i]),
|
||||
record->host_length[i]);
|
||||
}
|
||||
next->next = NULL;
|
||||
return sa;
|
||||
coucal_add_pvoid(cache, iadr, record);
|
||||
}
|
||||
|
||||
/* return result if any */
|
||||
return sa;
|
||||
} // retour hp du cache
|
||||
/* copy result to caller (cache store may have failed; result still valid)
|
||||
*/
|
||||
for (i = 0; i < count && i < max; i++) {
|
||||
SOCaddr_copy_SOCaddr(out[i], resolved[i]);
|
||||
}
|
||||
return count;
|
||||
} // retour hp du cache
|
||||
}
|
||||
|
||||
SOCaddr* hts_dns_resolve2(httrackp * opt, const char *_iadr, SOCaddr *const addr, const char **error) {
|
||||
SOCaddr *ret;
|
||||
int hts_dns_resolve_all(httrackp *opt, const char *iadr, SOCaddr *out, int max,
|
||||
const char **error) {
|
||||
int count;
|
||||
|
||||
if (!strnotempty(iadr) || max <= 0) {
|
||||
return 0;
|
||||
}
|
||||
hts_mutexlock(&opt->state.lock);
|
||||
ret = hts_dns_resolve_(opt, _iadr, addr, error);
|
||||
count = hts_dns_resolve_list_(opt, iadr, out, max, error);
|
||||
hts_mutexrelease(&opt->state.lock);
|
||||
return ret;
|
||||
return count;
|
||||
}
|
||||
|
||||
SOCaddr *hts_dns_resolve2(httrackp *opt, const char *_iadr, SOCaddr *const addr,
                          const char **error) {
  /* Single-address lookup on top of hts_dns_resolve_all(): requests at most
     one address, written directly into *addr. Returns addr when an address
     was found and is valid, NULL otherwise. */
  int found;

  /* cleared first, so a failed lookup leaves *addr empty */
  SOCaddr_clear(*addr);
  found = hts_dns_resolve_all(opt, _iadr, addr, 1, error);
  if (found <= 0) {
    return NULL;
  }
  if (!SOCaddr_is_valid(*addr)) {
    return NULL;
  }
  return addr;
}
|
||||
|
||||
SOCaddr* hts_dns_resolve(httrackp * opt, const char *_iadr, SOCaddr *const addr) {
|
||||
@@ -5300,6 +5488,11 @@ static int get_loglevel_from_coucal(coucal_loglevel level) {
|
||||
static void default_coucal_loghandler(void *arg, coucal_loglevel level,
|
||||
const char* format, va_list args) {
|
||||
|
||||
/* informational chatter (hashtable stats on delete, etc.) only when
|
||||
debugging; keep warnings and critical errors always visible. */
|
||||
if (level > coucal_log_warning && hts_dgb_init <= 0) {
|
||||
return;
|
||||
}
|
||||
if (level <= coucal_log_warning) {
|
||||
fprintf(stderr, "** warning: ");
|
||||
}
|
||||
@@ -5800,14 +5993,14 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
||||
|
||||
/* Cache */
|
||||
if (opt->state.dns_cache != NULL) {
|
||||
t_dnscache *root;
|
||||
coucal root;
|
||||
|
||||
hts_mutexlock(&opt->state.lock);
|
||||
root = opt->state.dns_cache;
|
||||
opt->state.dns_cache = NULL;
|
||||
hts_mutexrelease(&opt->state.lock);
|
||||
|
||||
hts_cache_free(root);
|
||||
coucal_delete(&root); // frees records via hts_cache_value_free
|
||||
}
|
||||
|
||||
/* Cancel chain */
|
||||
|
||||
51
src/htslib.h
51
src/htslib.h
@@ -147,11 +147,13 @@ struct OLD_htsblk {
|
||||
#define HTS_DEF_FWSTRUCT_t_dnscache
|
||||
typedef struct t_dnscache t_dnscache;
|
||||
#endif
|
||||
// One DNS cache record, stored as a coucal value keyed by hostname.
|
||||
struct t_dnscache {
|
||||
struct t_dnscache *next;
|
||||
const char *iadr;
|
||||
size_t host_length; // length ; (4 or 16) ; 0 for error
|
||||
char host_addr[HTS_MAXADDRLEN];
|
||||
// resolved addresses, in resolver (RFC 6724) order; host_count==0 means the
|
||||
// name does not resolve (negative cache). host_count<=HTS_MAXADDRNUM.
|
||||
int host_count;
|
||||
size_t host_length[HTS_MAXADDRNUM]; // sockaddr length of each (16 or 28)
|
||||
char host_addr[HTS_MAXADDRNUM][HTS_MAXADDRLEN];
|
||||
};
|
||||
|
||||
/* Library internal definitions */
|
||||
@@ -191,6 +193,13 @@ int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||
//int newhttp(char* iadr,char* err=NULL);
|
||||
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
||||
int waitconnect);
|
||||
/* Like newhttp(), but connect to the addr_index-th resolved address of the host
|
||||
(0-based) instead of always the first; *addr_count, if non-NULL, is set to
|
||||
the total resolved addresses. newhttp() == newhttp_addr(...,0,NULL). Used by
|
||||
the slot scheduler to try the next address when a connect fails (dead IPv6
|
||||
etc.). */
|
||||
T_SOC newhttp_addr(httrackp *opt, const char *iadr, htsblk *retour, int port,
|
||||
int waitconnect, int addr_index, int *addr_count);
|
||||
HTS_INLINE void deletehttp(htsblk * r);
|
||||
HTS_INLINE int deleteaddr(htsblk * r);
|
||||
HTS_INLINE void deletesoc(T_SOC soc);
|
||||
@@ -215,9 +224,14 @@ void treatfirstline(htsblk * retour, const char *rcvd);
|
||||
|
||||
// sous-fonctions
|
||||
LLint http_xfread1(htsblk * r, int bufl);
|
||||
HTS_INLINE SOCaddr* hts_dns_resolve2(httrackp * opt, const char *iadr,
|
||||
SOCaddr *const addr,
|
||||
const char **error);
|
||||
/* Cached resolver: fill out[0..count-1] with up to max addresses for iadr (in
|
||||
resolver order), returning the count (0 = does not resolve, negative-cached).
|
||||
Resolves once per host; later calls read the DNS cache. Must hold no lock
|
||||
(brackets opt->state.lock itself). */
|
||||
int hts_dns_resolve_all(httrackp *opt, const char *iadr, SOCaddr *out, int max,
|
||||
const char **error);
|
||||
HTS_INLINE SOCaddr *hts_dns_resolve2(httrackp *opt, const char *iadr,
|
||||
SOCaddr *const addr, const char **error);
|
||||
HTS_INLINE SOCaddr* hts_dns_resolve(httrackp * opt, const char *iadr,
|
||||
SOCaddr *const addr);
|
||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
||||
@@ -230,8 +244,9 @@ HTSEXT_API int check_hostname_dns(const char *const hostname);
|
||||
int ftp_available(void);
|
||||
|
||||
#if HTS_DNSCACHE
|
||||
void hts_cache_free(t_dnscache *const cache);
|
||||
t_dnscache *hts_cache(httrackp * opt);
|
||||
/* Return opt's DNS cache hashtable (hostname -> t_dnscache record), creating it
|
||||
on first use. Records are owned by the table and freed on coucal_delete. */
|
||||
coucal hts_cache(httrackp *opt);
|
||||
#endif
|
||||
|
||||
// outils divers
|
||||
@@ -481,10 +496,22 @@ HTS_STATIC int strcmpnocase(const char *a, const char *b) {
|
||||
|
||||
// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type?
|
||||
#define HTS_HYPERTEXT_DEFAULT_MIME "text/html"
|
||||
/* Sentinel stored when the server declared no Content-Type. It is html-ish
|
||||
for every type test (so a typeless response still parses/stores as today),
|
||||
but the naming code (wire_patches_ext) treats it as "no declared type" and
|
||||
keeps the URL extension. It rides the cache, so updates name consistently. */
|
||||
#define HTS_UNKNOWN_MIME "unknown/unknown"
|
||||
/* Map the no-declared-type sentinel back to a real type for any header or
|
||||
record we EMIT or PERSIST, so "unknown/unknown" never reaches a consumer
|
||||
(a served Content-Type, a ProxyTrack .arc record, ...). */
|
||||
#define hts_effective_mime(m) \
|
||||
(strfield2((m), HTS_UNKNOWN_MIME) ? HTS_HYPERTEXT_DEFAULT_MIME : (m))
|
||||
|
||||
#define is_html_mime_type(a) \
|
||||
( (strfield2((a),"text/html")!=0)\
|
||||
|| (strfield2((a),"application/xhtml+xml")!=0) \
|
||||
#define is_html_mime_type(a) \
|
||||
((strfield2((a), "text/html") != 0) || \
|
||||
(strfield2((a), "application/xhtml+xml") != 0) || \
|
||||
(strfield2((a), HTS_UNKNOWN_MIME) != \
|
||||
0) /* no declared type: treat as html */ \
|
||||
)
|
||||
#define is_hypertext_mime__(a) \
|
||||
( \
|
||||
|
||||
@@ -138,6 +138,35 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||
extension implies no specific type or the server declared a disagreeing one.
|
||||
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
|
||||
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
|
||||
is named .html. The sentinel rides the cache, so updates stay consistent. */
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
                            const char *file) {
  /* Returns 1 when the wire Content-Type should decide the saved file's
     extension, 0 when the URL's own extension is kept. The checks run in this
     order and the first one that applies wins. */
  char ext_mime[256]; /* type implied by the URL extension, if any */
  int patch;

  if (may_unknown2(opt, wiremime, file)) {
    /* type kept verbatim (keep-list / bogus-multiple) */
    patch = 0;
  } else {
    ext_mime[0] = '\0';
    /* flag 0: only a confidently known extension yields a type */
    if (!get_httptype_sized(opt, ext_mime, sizeof(ext_mime), file, 0)) {
      /* URL ext implies no known type: trust the wire type */
      patch = 1;
    } else if (strfield2(wiremime, ext_mime)) {
      /* wire agrees with the ext: keep it (no .htm->.html churn) */
      patch = 0;
    } else {
      /* Wire and extension disagree. The extension survives only when it maps
         to a non-hypertext type AND the server declared no type at all (the
         HTS_UNKNOWN_MIME sentinel); any explicitly declared type, even
         text/html, is trusted. */
      const int ext_not_hypertext = !is_hypertext_mime(opt, ext_mime, file);

      patch = !(ext_not_hypertext && strfield2(wiremime, HTS_UNKNOWN_MIME));
    }
  }
  return patch;
}
|
||||
|
||||
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
||||
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
||||
int url_savename(lien_adrfilsave *const afs,
|
||||
@@ -325,7 +354,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
|
||||
/* replace shtml to html.. */
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD)
|
||||
/* HARD delays every type, except one the user pinned with --assume: honor it
|
||||
immediately (ishtml() consults the user type), no delayed name (#56) */
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||
!is_userknowntype(opt, fil))
|
||||
is_html = -1; /* ALWAYS delay type */
|
||||
else
|
||||
is_html = ishtml(opt, fil);
|
||||
@@ -380,7 +412,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
@@ -425,7 +457,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
@@ -641,7 +674,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (!has_been_moved) {
|
||||
if (back[b].r.statuscode != -10) { // erreur
|
||||
if (strnotempty(back[b].r.contenttype) == 0)
|
||||
strcpybuff(back[b].r.contenttype, "text/html"); // message d'erreur en html
|
||||
strcpybuff(back[b].r.contenttype,
|
||||
HTS_UNKNOWN_MIME); // no declared type
|
||||
// Finalement on, renvoie un erreur, pour ne toucher à rien dans le code
|
||||
// libérer emplacement backing
|
||||
}
|
||||
@@ -653,7 +687,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
@@ -725,9 +760,9 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
|
||||
}
|
||||
}
|
||||
// Changer extension?
|
||||
// par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
|
||||
if (ext_chg && !opt->no_type_change) { // changer ext
|
||||
// Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm
|
||||
// depending on the resolved type.
|
||||
if (ext_chg && !opt->no_type_change) {
|
||||
char *a = fil + strlen(fil) - 1;
|
||||
|
||||
if ((opt->debug > 1) && (opt->log != NULL)) {
|
||||
@@ -739,11 +774,19 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
adr_complete, fil_complete, ext);
|
||||
}
|
||||
if (ext_chg == 1) {
|
||||
// Cut the old extension only when it is empty (a bare trailing dot), the
|
||||
// new one, or a recognized one; an unknown trailing ".token" (e.g.
|
||||
// /article-1.884291, #115) is part of the name, not an extension.
|
||||
const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil);
|
||||
const int known_ext = !*old_ext || strfield2(old_ext, ext) ||
|
||||
is_knowntype(opt, fil) || is_dyntype(old_ext) ||
|
||||
ishtml_ext(old_ext) != -1;
|
||||
|
||||
while((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (*a == '.')
|
||||
*a = '\0'; // couper
|
||||
strcatbuff(fil, "."); // recopier point
|
||||
if (*a == '.' && known_ext)
|
||||
*a = '\0'; // cut
|
||||
strcatbuff(fil, "."); // re-add the dot
|
||||
} else {
|
||||
while((a > fil) && (*a != '/'))
|
||||
a--;
|
||||
@@ -751,7 +794,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
a++;
|
||||
*a = '\0';
|
||||
}
|
||||
strcatbuff(fil, ext); // copier ext/nom
|
||||
strcatbuff(fil, ext); // append ext/name
|
||||
}
|
||||
// Rechercher premier / et dernier .
|
||||
{
|
||||
@@ -1686,10 +1729,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
StringBuff(opt->path_log), digest_filename);
|
||||
}
|
||||
|
||||
/* remove refname if any */
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil) {
|
||||
/* remove refname if any; HTS_TRUE if it was removed */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil) {
|
||||
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
||||
|
||||
(void) UNLINK(filename);
|
||||
return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
@@ -104,8 +104,9 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
||||
const char *fil);
|
||||
/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
|
||||
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||
const char *fil);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
16
src/htsnet.h
16
src/htsnet.h
@@ -304,6 +304,22 @@ static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
||||
/** Length type for socket APIs (getsockname, accept, ...). */
|
||||
typedef socklen_t SOClen;
|
||||
|
||||
#if HTS_INET6 != 0
|
||||
/** Resolver backend: getaddrinfo/freeaddrinfo as a swappable pair, so the
|
||||
self-test can script DNS answers (families, multiplicity, errors)
|
||||
in-process. The free function must match its getaddrinfo (a fake allocates
|
||||
its own chain), hence the pair. */
|
||||
typedef struct hts_resolver_backend {
|
||||
int (*getaddrinfo)(const char *node, const char *service,
|
||||
const struct addrinfo *hints, struct addrinfo **res);
|
||||
void (*freeaddrinfo)(struct addrinfo *res);
|
||||
} hts_resolver_backend;
|
||||
|
||||
/** Install a resolver backend for the process; NULL restores the libc default.
|
||||
Test-only seam, not thread-safe; callers must serialize against resolves. */
|
||||
void hts_dns_set_resolver_backend(const hts_resolver_backend *backend);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
12
src/htsopt.h
12
src/htsopt.h
@@ -241,7 +241,7 @@ struct htsoptstate {
|
||||
char *userhttptype;
|
||||
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
||||
int verif_external_status;
|
||||
t_dnscache *dns_cache; /**< DNS resolution cache */
|
||||
coucal dns_cache; /**< DNS resolution cache: hostname -> t_dnscache record */
|
||||
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
||||
/* HTML parsing state */
|
||||
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
||||
@@ -428,11 +428,11 @@ struct httrackp {
|
||||
LLint maxfile_html; /**< max bytes per HTML file */
|
||||
int maxsoc; /**< max simultaneous sockets (-cN) */
|
||||
LLint fragment; /**< split site after this many bytes */
|
||||
hts_boolean
|
||||
hts_tristate
|
||||
nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||
hts_boolean makeindex; /**< build a top-level index.html */
|
||||
hts_boolean kindex; /**< build a keyword index */
|
||||
hts_boolean delete_old; /**< delete locally obsolete files after update */
|
||||
hts_tristate delete_old; /**< delete locally obsolete files after update */
|
||||
int timeout; /**< connection timeout in seconds */
|
||||
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
||||
int maxtime; /**< max total mirror duration in seconds */
|
||||
@@ -465,13 +465,13 @@ struct httrackp {
|
||||
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||
int hostcontrol; /**< ban slow/timing-out hosts; see hts_hostcontrol bits */
|
||||
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
||||
hts_tristate errpage; /**< generate an error page on 404 and similar */
|
||||
hts_boolean
|
||||
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||
*/
|
||||
hts_boolean all_in_cache; /**< keep all retrieved data in the cache */
|
||||
hts_robots robots; /**< robots.txt handling level */
|
||||
hts_boolean external; /**< render external links as error pages */
|
||||
hts_tristate external; /**< render external links as error pages */
|
||||
hts_boolean passprivacy; /**< strip passwords from external links */
|
||||
hts_boolean includequery; /**< include the query string in saved names */
|
||||
hts_boolean mirror_first_page; /**< only mirror the links of the first page */
|
||||
@@ -485,7 +485,7 @@ struct httrackp {
|
||||
hts_boolean sizehack; /**< treat same-size response as "updated" */
|
||||
hts_boolean urlhack; // force "url normalization" to avoid loops
|
||||
hts_boolean tolerant; /**< accept an incorrect Content-Length */
|
||||
hts_boolean
|
||||
hts_tristate
|
||||
parseall; /**< parse aggressively, including unknown tags with links */
|
||||
hts_boolean parsedebug; /**< parser debug mode */
|
||||
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
||||
|
||||
@@ -3749,44 +3749,60 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
|
||||
} // bloc
|
||||
// erreur HTTP (ex: 404, not found)
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
|
||||
|| (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
|
||||
) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
|
||||
if (fexist_utf8(heap(ptr)->sav)) {
|
||||
remove(heap(ptr)->sav); // Eliminer
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
|
||||
r->msg, urladr(), urlfil(),
|
||||
heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
|
||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
|
||||
(r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
|
||||
// 412/416: the resume partial is stale; re-get the whole file (#206)
|
||||
lien_back *itemback = NULL;
|
||||
int had_partial = 0;
|
||||
int ref_existed = 0;
|
||||
int ref_gone;
|
||||
|
||||
// Drop the temp-ref, its partial, and heap->sav so the re-get carries no
|
||||
// Range; else back_add rebuilds the same Range and loops.
|
||||
if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
|
||||
&itemback) == 0) {
|
||||
had_partial = 1;
|
||||
ref_existed = 1;
|
||||
// best-effort: an orphaned partial cannot re-Range once the ref is gone
|
||||
if (fexist_utf8(itemback->url_sav))
|
||||
(void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
itemback->url_sav));
|
||||
back_clear_entry(itemback);
|
||||
freet(itemback);
|
||||
}
|
||||
if (!fexist_utf8(heap(ptr)->sav)) { // Bien éliminé? (sinon on boucle..)
|
||||
#if HDEBUG
|
||||
printf("Partial content NOT up-to-date, reget all file for %s\n",
|
||||
heap(ptr)->sav);
|
||||
#endif
|
||||
// don't re-record if the ref survived (it would re-Range and loop)
|
||||
ref_gone =
|
||||
url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
|
||||
!ref_existed;
|
||||
if (fexist_utf8(heap(ptr)->sav)) {
|
||||
had_partial = 1;
|
||||
remove(heap(ptr)->sav);
|
||||
}
|
||||
|
||||
// Re-get once, only if a partial existed and both Range triggers are
|
||||
// gone; a failed removal gives up rather than looping. range_used is
|
||||
// unreliable (it does not survive the delayed-type two-pass).
|
||||
if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
||||
r->msg, urladr(), urlfil());
|
||||
// enregistrer le MEME lien
|
||||
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
heap_top()->testmode = heap(ptr)->testmode; // mode test?
|
||||
heap_top()->link_import = 0; // pas mode import
|
||||
heap_top()->testmode = heap(ptr)->testmode;
|
||||
heap_top()->link_import = 0;
|
||||
heap_top()->depth = heap(ptr)->depth;
|
||||
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
||||
heap_top()->retry = heap(ptr)->retry;
|
||||
heap_top()->premier = heap(ptr)->premier;
|
||||
heap_top()->precedent = ptr;
|
||||
//
|
||||
// canceller lien actuel
|
||||
error = 1;
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
//
|
||||
} else { // oups erreur, plus de mémoire!!
|
||||
XH_uninit; // désallocation mémoire & buffers
|
||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||
} else { // out of memory
|
||||
XH_uninit;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Giving up on partial reget (%s) for %s%s", r->msg,
|
||||
urladr(), urlfil());
|
||||
error = 1;
|
||||
}
|
||||
|
||||
|
||||
1093
src/htsselftest.c
Normal file
1093
src/htsselftest.c
Normal file
File diff suppressed because it is too large
Load Diff
52
src/htsselftest.h
Normal file
52
src/htsselftest.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 2026 Xavier Roche and other contributors
|
||||
|
||||
SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||
addresses or to collect any other private information about people. Doing so
|
||||
would dishonor our work and waste the many hours we have spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: htsselftest.h */
|
||||
/* named dispatch for the hidden engine self-tests */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSSELFTEST_DEFH
|
||||
#define HTSSELFTEST_DEFH
|
||||
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||
#define HTS_DEF_FWSTRUCT_httrackp
|
||||
typedef struct httrackp httrackp;
|
||||
#endif
|
||||
|
||||
/* Run engine self-test `name` over the positional args argv[0..argc-1], or list
|
||||
the available tests when name is NULL, empty, or "list". Prints the result;
|
||||
returns the process exit code (0 == success). The caller owns option cleanup.
|
||||
Reached through the hidden `httrack -#test[=NAME ...]` subcommand. */
|
||||
int hts_selftest(httrackp *opt, const char *name, int argc, char **argv);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1176,11 +1176,15 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
if (element != NULL) {
|
||||
msgCode = element->statuscode;
|
||||
StringRoom(headers, 8192);
|
||||
sprintf(StringBuffRW(headers), "HTTP/1.1 %d %s\r\n"
|
||||
sprintf(StringBuffRW(headers),
|
||||
"HTTP/1.1 %d %s\r\n"
|
||||
#ifndef NO_WEBDAV
|
||||
"%s"
|
||||
#endif
|
||||
"Content-Type: %s%s%s%s\r\n" "%s%s%s" "%s%s%s" "%s%s%s",
|
||||
"Content-Type: %s%s%s%s\r\n"
|
||||
"%s%s%s"
|
||||
"%s%s%s"
|
||||
"%s%s%s",
|
||||
/* */
|
||||
msgCode, element->msg,
|
||||
#ifndef NO_WEBDAV
|
||||
@@ -1188,16 +1192,18 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
StringBuff(davHeaders),
|
||||
#endif
|
||||
/* Content-type: foo; [ charset=bar ] */
|
||||
element->contenttype,
|
||||
hts_effective_mime(element->contenttype),
|
||||
((element->charset[0]) ? "; charset=\"" : ""),
|
||||
element->charset, ((element->charset[0]) ? "\"" : ""),
|
||||
/* location */
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? "Location: " : ""),
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? element->location : ""),
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? "\r\n" : ""),
|
||||
((element->location != NULL && element->location[0])
|
||||
? "Location: "
|
||||
: ""),
|
||||
((element->location != NULL && element->location[0])
|
||||
? element->location
|
||||
: ""),
|
||||
((element->location != NULL && element->location[0]) ? "\r\n"
|
||||
: ""),
|
||||
/* last-modified */
|
||||
((element->lastmodified[0]) ? "Last-Modified: " : ""),
|
||||
((element->lastmodified[0]) ? element->lastmodified : ""),
|
||||
@@ -1205,8 +1211,7 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
/* etag */
|
||||
((element->etag[0]) ? "ETag: " : ""),
|
||||
((element->etag[0]) ? element->etag : ""),
|
||||
((element->etag[0]) ? "\r\n" : "")
|
||||
);
|
||||
((element->etag[0]) ? "\r\n" : ""));
|
||||
StringLength(headers) = (int) strlen(StringBuff(headers));
|
||||
} else {
|
||||
/* No query string, no ending / : check the <url>/ page */
|
||||
|
||||
@@ -52,6 +52,7 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#include "htscore.h"
|
||||
#include "htsback.h"
|
||||
#include "htslib.h" /* hts_effective_mime */
|
||||
|
||||
#include "store.h"
|
||||
#include "proxystrings.h"
|
||||
@@ -2289,10 +2290,17 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
||||
int size_headers;
|
||||
|
||||
sprintf(st->headers,
|
||||
"HTTP/1.0 %d %s" "\r\n" "X-Server: ProxyTrack " PROXYTRACK_VERSION
|
||||
"\r\n" "Content-type: %s%s%s%s" "\r\n" "Last-modified: %s" "\r\n"
|
||||
"Content-length: %d" "\r\n", element->statuscode, element->msg,
|
||||
/**/ element->contenttype,
|
||||
"HTTP/1.0 %d %s"
|
||||
"\r\n"
|
||||
"X-Server: ProxyTrack " PROXYTRACK_VERSION "\r\n"
|
||||
"Content-type: %s%s%s%s"
|
||||
"\r\n"
|
||||
"Last-modified: %s"
|
||||
"\r\n"
|
||||
"Content-length: %d"
|
||||
"\r\n",
|
||||
element->statuscode, element->msg,
|
||||
/**/ hts_effective_mime(element->contenttype),
|
||||
(element->charset[0] ? "; charset=\"" : ""),
|
||||
(element->charset[0] ? element->charset : ""),
|
||||
(element->charset[0] ? "\"" : ""), /**/ element->lastmodified,
|
||||
@@ -2328,10 +2336,10 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
||||
/* args */
|
||||
(link_has_authority(url) ? "" : "http://"), url, "0.0.0.0",
|
||||
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
|
||||
tm->tm_min, tm->tm_sec, element->contenttype, element->statuscode,
|
||||
st->md5, (element->location ? element->location : "-"),
|
||||
(long int) ftell(fp), st->filename,
|
||||
(long int) (size_headers + element->size));
|
||||
tm->tm_min, tm->tm_sec, hts_effective_mime(element->contenttype),
|
||||
element->statuscode, st->md5,
|
||||
(element->location ? element->location : "-"), (long int) ftell(fp),
|
||||
st->filename, (long int) (size_headers + element->size));
|
||||
/* network_doc */
|
||||
if (fwrite(st->headers, 1, size_headers, fp) != size_headers
|
||||
|| (element->size > 0
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#B <dir>').
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
@@ -13,7 +13,7 @@
|
||||
# byte-exact.
|
||||
#
|
||||
# Regenerate the fixture after a deliberate format change with
|
||||
# 'httrack -#B <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||
# 'httrack -#test=cache-golden <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||
# committed file.
|
||||
|
||||
set -eu
|
||||
@@ -37,11 +37,11 @@ trap 'rm -rf "$dir"' EXIT
|
||||
mkdir -p "$dir/hts-cache"
|
||||
cp "$fixture/hts-cache/new.zip" "$dir/hts-cache/new.zip"
|
||||
|
||||
out=$(httrack -#B "$dir")
|
||||
out=$(httrack -#test=cache-golden "$dir")
|
||||
|
||||
# Match the exact success line: the read must have found and verified every
|
||||
# entry, not merely failed to enter the mode (a bad -#B falls through to the
|
||||
# usage screen, which also exits non-zero but never prints this).
|
||||
# entry, not merely failed to enter the mode (a renamed/removed test prints the
|
||||
# registry to stderr, which also exits non-zero but never prints this).
|
||||
test "$out" = "cache-golden: OK" || {
|
||||
echo "expected 'cache-golden: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
24
tests/01_engine-cache-writefail.test
Normal file
24
tests/01_engine-cache-writefail.test
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Cache write-failure handling (httrack -#test=cache-writefail <dir>). #174/#219.
|
||||
# A failing new.zip write (disk full) used to crash the process via assertf; it
|
||||
# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
|
||||
# self-test asserts that; reverting the fix makes -#test=cache-writefail abort (SIGABRT) and fail.
|
||||
|
||||
set -eu
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
out=$(httrack -#test=cache-writefail "$dir")
|
||||
|
||||
# Match the exact success line (error logs also go to stdout); a renamed/removed
|
||||
# test prints the registry to stderr, which exits non-zero but never prints this.
|
||||
printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
|
||||
echo "expected 'cache-writefail: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -4,7 +4,7 @@
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Cache create/read/update logic (driven by 'httrack -#A <dir>').
|
||||
# Cache create/read/update logic (driven by 'httrack -#test=cache <dir>').
|
||||
#
|
||||
# The in-process self-test stores several hand-crafted edge entries (normal
|
||||
# HTML, an empty redirect with a near-limit location, a non-HTML body kept via
|
||||
@@ -20,13 +20,13 @@ set -eu
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
# Like the other -# debug modes, a trailing token (the working directory) is
|
||||
# required; a bare '-#A' falls through to the usage screen.
|
||||
out=$(httrack -#A "$dir")
|
||||
# The working directory is a required argument; without it the test prints a
|
||||
# usage line to stderr and returns non-zero.
|
||||
out=$(httrack -#test=cache "$dir")
|
||||
|
||||
# Match the exact success line, so the test cannot pass for an unrelated reason
|
||||
# (e.g. the -#A mode being gone and falling through to the usage screen, which
|
||||
# also exits non-zero but never prints this).
|
||||
# (e.g. the cache test being gone, which prints the registry to stderr but
|
||||
# never prints this line).
|
||||
test "$out" = "cache-selftest: OK" || {
|
||||
echo "expected 'cache-selftest: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
||||
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||
# -#test=charset <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||
conv() {
|
||||
test "$(httrack -O /dev/null -#3 "$1" "$2")" == "$3" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=charset "$1" "$2")" == "$3" || exit 1
|
||||
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
runs() {
|
||||
httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||
httrack -O /dev/null -#test=charset "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||
}
|
||||
|
||||
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
||||
@@ -31,7 +31,7 @@ conv 'us-ascii' 'hello' 'hello'
|
||||
# unknown charset: ASCII passes through unchanged, but non-ASCII input cannot be
|
||||
# decoded and yields empty output (an error is printed to stderr).
|
||||
conv 'no-such-charset-xyz' 'abc' 'abc'
|
||||
test "$(httrack -O /dev/null -#3 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=charset 'no-such-charset-xyz' 'café' 2>/dev/null)" == "" || exit 1
|
||||
|
||||
# malformed UTF-8 (lone continuation byte, truncated lead byte) must not crash
|
||||
runs 'utf-8' $'\x80'
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#test=cookies' selftest.
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
||||
out=$(httrack -#Q run)
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=cookies run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||
# to stderr) can't pass.
|
||||
test "$out" = "cookie-header: OK" || {
|
||||
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
@@ -2,15 +2,16 @@
|
||||
#
|
||||
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
||||
# they silently stop being copied. Driven by the in-process 'httrack -#test=copyopt' test.
|
||||
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
||||
out=$(httrack -#9 run)
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=copyopt run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
# Exact-match the success line so a renamed/removed test (it prints the registry
|
||||
# to stderr) can't pass.
|
||||
test "$out" = "copy-htsopt: OK" || {
|
||||
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||
exit 1
|
||||
|
||||
14
tests/01_engine-dns.test
Normal file
14
tests/01_engine-dns.test
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# DNS resolver/cache self-test: a mock getaddrinfo (no network) checks address
|
||||
# family, single-address selection, the -@i4/-@i6 family filter, and cache reuse.
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=dns run)
|
||||
|
||||
test "$out" = "dns-selftest: OK" || {
|
||||
echo "expected 'dns-selftest: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
||||
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
||||
# -#test=entities <string> prints the string with entities decoded (UTF-8 output).
|
||||
ent() {
|
||||
test "$(httrack -O /dev/null -#6 "$1")" == "$2" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=entities "$1")" == "$2" || exit 1
|
||||
}
|
||||
# crash probe: malformed input must exit cleanly, not abort.
|
||||
runs() {
|
||||
httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || exit 1
|
||||
httrack -O /dev/null -#test=entities "$1" >/dev/null 2>&1 || exit 1
|
||||
}
|
||||
|
||||
# named entities
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# wildcard filter engine (strjoker), the core of +/- include/exclude rules.
|
||||
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||
# -#test=filter <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||
|
||||
match() {
|
||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does match $1" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does match $1" || exit 1
|
||||
}
|
||||
nomatch() {
|
||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=filter "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||
}
|
||||
|
||||
# bare star matches everything
|
||||
|
||||
@@ -3,5 +3,7 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# httrack internal hashtable autotest on 100K keys
|
||||
httrack -#7 100000
|
||||
# httrack internal hashtable autotest on 100K keys. Assert the success line (on
|
||||
# stderr) so a misrouted registry entry can't pass on exit code alone.
|
||||
out=$(httrack -#test=hashtable 100000 2>&1)
|
||||
printf '%s\n' "$out" | grep -q "all hashtable tests were successful!" || exit 1
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# IDNA / punycode encode (-#4) and decode (-#5). This code has a CVE history,
|
||||
# IDNA / punycode encode (-#test=idna-encode) and decode (-#test=idna-decode). This code has a CVE history,
|
||||
# so the edge cases below cover passthrough, round-trips, and malformed input.
|
||||
|
||||
enc() { test "$(httrack -O /dev/null -#4 "$1")" == "$2" || exit 1; }
|
||||
dec() { test "$(httrack -O /dev/null -#5 "$1")" == "$2" || exit 1; }
|
||||
enc() { test "$(httrack -O /dev/null -#test=idna-encode "$1")" == "$2" || exit 1; }
|
||||
dec() { test "$(httrack -O /dev/null -#test=idna-decode "$1")" == "$2" || exit 1; }
|
||||
# crash probe: malformed ACE input must exit cleanly, not abort.
|
||||
runs() { httrack -O /dev/null -#5 "$1" >/dev/null 2>&1 || exit 1; }
|
||||
runs() { httrack -O /dev/null -#test=idna-decode "$1" >/dev/null 2>&1 || exit 1; }
|
||||
|
||||
# encode
|
||||
enc 'www.café.com' 'www.xn--caf-dma.com'
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
set -euo pipefail
|
||||
|
||||
# MIME type guessing from extension (get_httptype / give_mimext).
|
||||
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||
# -#test=mime <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||
|
||||
mime() {
|
||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||
}
|
||||
unknown() {
|
||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=mime "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||
}
|
||||
|
||||
mime '/a/b.html' 'text/html'
|
||||
|
||||
@@ -8,7 +8,7 @@ set -euo pipefail
|
||||
# relative path from <curr>'s directory to <link>
|
||||
rel() {
|
||||
local got
|
||||
got=$(httrack -O /dev/null -#l "$1" "$2")
|
||||
got=$(httrack -O /dev/null -#test=relative "$1" "$2")
|
||||
test "$got" == "relative=$3" ||
|
||||
{
|
||||
echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"
|
||||
@@ -19,7 +19,7 @@ rel() {
|
||||
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
|
||||
ident() {
|
||||
local got
|
||||
got=$(httrack -O /dev/null -#i "$1" "$2" "$3")
|
||||
got=$(httrack -O /dev/null -#test=resolve "$1" "$2" "$3")
|
||||
test "$got" == "$4" ||
|
||||
{
|
||||
echo "FAIL ident($1, $2, $3): got '$got' want '$4'"
|
||||
|
||||
41
tests/01_engine-savename.test
Executable file
41
tests/01_engine-savename.test
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
|
||||
# Asserts on the basename of "savename: <path>".
|
||||
|
||||
name() {
|
||||
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
|
||||
test "${out##*/}" == "$3" || {
|
||||
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
# #115: an unknown trailing ".token" is part of the name, keep it and append the type.
|
||||
name '/article-1.884291' 'text/html' 'article-1.884291.html'
|
||||
name '/news/story-12345.987654' 'text/html' 'story-12345.987654.html'
|
||||
|
||||
# Recognized extensions still collapse to the resolved type.
|
||||
name '/page.php' 'text/html' 'page.html'
|
||||
name '/page.asp' 'text/html' 'page.html'
|
||||
name '/foo' 'text/html' 'foo.html'
|
||||
|
||||
# A bare trailing dot is not a tail to keep.
|
||||
name '/page.' 'text/html' 'page.html'
|
||||
|
||||
# Soft-404 (#267/#408): a binary URL served as HTML is named .html.
|
||||
name '/x.pdf' 'text/html' 'x.html'
|
||||
name '/x.gif' 'text/html' 'x.html'
|
||||
|
||||
# Type agrees with the extension: keep it, no churn, no double extension.
|
||||
name '/x.pdf' 'application/pdf' 'x.pdf'
|
||||
name '/x.jpg' 'image/jpeg' 'x.jpg'
|
||||
name '/x.html' 'text/html' 'x.html'
|
||||
name '/x.js' 'application/x-javascript' 'x.js'
|
||||
name '/types/data.json' 'application/json' 'data.json'
|
||||
|
||||
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
|
||||
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||
17
tests/01_engine-selftest-dispatch.test
Normal file
17
tests/01_engine-selftest-dispatch.test
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# The -#test dispatch itself: a bare -#test lists the registry, and an unknown
|
||||
# name errors (non-zero, diagnostic) instead of silently passing.
|
||||
|
||||
set -eu
|
||||
|
||||
# Bare -#test lists known tests (printed to stderr).
|
||||
list=$(httrack -#test 2>&1)
|
||||
printf '%s\n' "$list" | grep -q "filter" || exit 1
|
||||
printf '%s\n' "$list" | grep -q "cache-writefail" || exit 1
|
||||
|
||||
# Unknown name: non-zero exit + diagnostic, and no test result line.
|
||||
rc=0
|
||||
err=$(httrack -#test=bogus 2>&1) || rc=$?
|
||||
test "$rc" -ne 0 || exit 1
|
||||
printf '%s\n' "$err" | grep -q "Unknown self-test" || exit 1
|
||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
||||
|
||||
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
||||
simp() {
|
||||
test "$(httrack -O /dev/null -#1 "$1")" == "simplified=$2" || exit 1
|
||||
test "$(httrack -O /dev/null -#test=simplify "$1")" == "simplified=$2" || exit 1
|
||||
}
|
||||
|
||||
simp './foo/bar/' 'foo/bar/'
|
||||
|
||||
@@ -3,23 +3,22 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# htssafe.h bounded string operations (driven by 'httrack -#8').
|
||||
# htssafe.h bounded string operations (driven by 'httrack -#test=strsafe').
|
||||
|
||||
# Success path: every bounded op (strcpybuff/strcatbuff/strncatbuff/strlcpybuff)
|
||||
# must behave correctly. Like the other -# debug modes, a trailing token is
|
||||
# required (a bare '-#8' falls through to the usage screen).
|
||||
# must behave correctly. 'run' selects the success path (vs the overflow modes).
|
||||
rc=0
|
||||
out=$(httrack -#8 run) || rc=$?
|
||||
out=$(httrack -#test=strsafe run) || rc=$?
|
||||
test "$rc" -eq 0 || exit 1
|
||||
test "$out" == "strsafe: OK" || exit 1
|
||||
|
||||
# Overflow path: an over-capacity write into a sized buffer must be caught by
|
||||
# the bounded macro and abort the process, not be silently truncated/completed.
|
||||
# Assert the htssafe abort signature specifically, so the test cannot pass for
|
||||
# an unrelated reason (e.g. the -#8 mode being gone and falling through to the
|
||||
# usage screen, which also exits non-zero).
|
||||
# an unrelated reason (e.g. the strsafe test being gone, which prints the
|
||||
# registry to stderr and also exits non-zero).
|
||||
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
||||
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
||||
err=$(httrack -#test=strsafe overflow "this string is far too long for the buffer" 2>&1) || true
|
||||
case "$err" in
|
||||
*"strsafe: NOT aborted"*)
|
||||
echo "over-capacity write was NOT caught" >&2
|
||||
@@ -36,7 +35,7 @@ esac
|
||||
# capacity (4 bytes into a 4-byte buffer), so this also pins the boundary: a
|
||||
# '<=' off-by-one in the capacity check would let it through (and print "NOT
|
||||
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
||||
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
||||
err=$(httrack -#test=strsafe overflow-buff "abcd" 2>&1) || true
|
||||
case "$err" in
|
||||
*"strsafe: NOT aborted"*)
|
||||
echo "htsbuff over-capacity write was NOT caught" >&2
|
||||
|
||||
25
tests/15_local-types.test
Normal file
25
tests/15_local-types.test
Normal file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||
# asserted absent so a regression in either direction fails here.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
--found 'types/style.css' \
|
||||
--found 'types/data.json' \
|
||||
--found 'types/control.html' --not-found 'types/control.php' \
|
||||
--found 'types/gend61c.png' --not-found 'types/gend61c.html' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
11
tests/16_local-assume.test
Normal file
11
tests/16_local-assume.test
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# --assume under the default delayed type check (-%N2), issue #56. A user type
|
||||
# pinned with --assume must be honored immediately, not lost to the delayed
|
||||
# name: photo.png served as image/png but assumed text/html is saved as .html.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/photo.html' --not-found 'types/photo.png' \
|
||||
httrack 'BASEURL/types/photo.png' --assume png=text/html
|
||||
12
tests/17_local-empty-ct.test
Normal file
12
tests/17_local-empty-ct.test
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An empty "Content-Type:" header value must be treated as "no usable type"
|
||||
# (keep the URL extension), not parsed from an uninitialized buffer. The crawl
|
||||
# also runs under ASan/UBSan in CI, which catches the uninitialized read this
|
||||
# guards against.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/emptyct.png' --not-found 'types/emptyct.html' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
15
tests/18_local-update.test
Normal file
15
tests/18_local-update.test
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||
# typeless .png stays .png across the update rather than reverting.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.html' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
110
tests/19_local-connect-fallback.test
Normal file
110
tests/19_local-connect-fallback.test
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# A host that resolves to several addresses must fall back to the next one when
|
||||
# a connect fails, instead of giving up on the first (dead IPv6 on a dual-stack
|
||||
# host, ...). HTTRACK_DEBUG_RESOLVE pins "deadhost" to a refused address first
|
||||
# (127.0.0.2, nothing listening) then the live server (127.0.0.1): the crawl
|
||||
# only succeeds if httrack retries the second address. A second case pins every
|
||||
# address to a refused one, so the slot must exhaust the list and error out
|
||||
# (rather than hang or loop).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
if test "${V6_SUPPORT:-}" == "no"; then
|
||||
echo "no IPv6 support (resolver list/override is IPv6-only), skipping"
|
||||
exit 77
|
||||
fi
|
||||
if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "python3 missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
|
||||
server="$top_srcdir/tests/local-server.py"
|
||||
root="$top_srcdir/tests/server-root"
|
||||
tmpdir=$(mktemp -d)
|
||||
serverpid=
|
||||
|
||||
cleanup() {
|
||||
if test -n "$serverpid"; then
|
||||
kill "$serverpid" 2>/dev/null || true
|
||||
wait "$serverpid" 2>/dev/null || true
|
||||
fi
|
||||
rm -rf "$tmpdir"
|
||||
return 0
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# bind the live server to 127.0.0.1 only, so 127.0.0.2 refuses the connect
|
||||
python3 "$server" --root "$root" --bind 127.0.0.1 >"$tmpdir/srv.out" 2>"$tmpdir/srv.err" &
|
||||
serverpid=$!
|
||||
port=
|
||||
for _ in $(seq 1 50); do
|
||||
line=$(head -n1 "$tmpdir/srv.out" 2>/dev/null || true)
|
||||
if test "${line%% *}" == "PORT"; then
|
||||
port="${line#PORT }"
|
||||
break
|
||||
fi
|
||||
kill -0 "$serverpid" 2>/dev/null || {
|
||||
echo "server exited early: $(cat "$tmpdir/srv.err")"
|
||||
exit 1
|
||||
}
|
||||
sleep 0.1
|
||||
done
|
||||
test -n "$port" || {
|
||||
echo "could not discover server port"
|
||||
exit 1
|
||||
}
|
||||
|
||||
out="$tmpdir/crawl"
|
||||
HTTRACK_DEBUG_RESOLVE="deadhost:127.0.0.2,127.0.0.1" \
|
||||
httrack "http://deadhost:$port/simple/basic.html" -O "$out" \
|
||||
-c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log" 2>&1
|
||||
|
||||
log="$out/hts-log.txt"
|
||||
|
||||
# the dead address was tried, then the next one (proves the fallback ran)
|
||||
if ! grep -q "trying next address" "$log"; then
|
||||
echo "FAIL: no connect fallback happened"
|
||||
cat "$log"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 0 errors and the file was actually fetched (over the live address)
|
||||
errs=$(grep -iEc "^[0-9:]*[[:space:]]Error:" "$log" || true)
|
||||
test "$errs" == "0" || {
|
||||
echo "FAIL: $errs error(s) reported"
|
||||
grep -iE "Error:" "$log"
|
||||
exit 1
|
||||
}
|
||||
test -f "$out/deadhost_$port/simple/basic.html" || {
|
||||
echo "FAIL: basic.html not downloaded via fallback"
|
||||
find "$out" -type f
|
||||
exit 1
|
||||
}
|
||||
|
||||
# every address refused: the slot exhausts the list, then errors out (the
|
||||
# harness timeout would catch a hang/loop; refused connects are instant)
|
||||
out2="$tmpdir/crawl2"
|
||||
HTTRACK_DEBUG_RESOLVE="alldead:127.0.0.2,127.0.0.3" \
|
||||
httrack "http://alldead:$port/simple/basic.html" -O "$out2" \
|
||||
-c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log2" 2>&1
|
||||
log2="$out2/hts-log.txt"
|
||||
|
||||
grep -q "trying next address" "$log2" || {
|
||||
echo "FAIL: exhaustion path never tried the fallback address"
|
||||
cat "$log2"
|
||||
exit 1
|
||||
}
|
||||
grep -iqE "^[0-9:]*[[:space:]]Error:" "$log2" || {
|
||||
echo "FAIL: all addresses failing did not report an error"
|
||||
cat "$log2"
|
||||
exit 1
|
||||
}
|
||||
test ! -f "$out2/alldead_$port/simple/basic.html" || {
|
||||
echo "FAIL: file downloaded despite every address failing"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "OK: connect fallback succeeds, and exhausting all addresses errors out"
|
||||
113
tests/20_local-resume-loop.test
Executable file
113
tests/20_local-resume-loop.test
Executable file
@@ -0,0 +1,113 @@
|
||||
#!/bin/bash
|
||||
# Issue #206: a continue/update crawl looped forever when the resume Range got a
|
||||
# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
|
||||
set -u
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
testdir=$(cd "$(dirname "$0")" && pwd)
|
||||
server="${testdir}/local-server.py"
|
||||
|
||||
command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
|
||||
|
||||
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
|
||||
serverpid=
|
||||
crawlpid=
|
||||
cleanup() {
|
||||
test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
|
||||
if test -n "$serverpid"; then
|
||||
kill "$serverpid" 2>/dev/null
|
||||
wait "$serverpid" 2>/dev/null
|
||||
fi
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
trap cleanup EXIT HUP INT QUIT PIPE TERM
|
||||
|
||||
# --- start the server, discover its ephemeral port --------------------------
|
||||
# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
|
||||
serverlog="${tmpdir}/server.log"
|
||||
counter="${tmpdir}/blobcount"
|
||||
RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
|
||||
serverpid=$!
|
||||
port=
|
||||
for _ in $(seq 1 50); do
|
||||
line=$(head -n1 "$serverlog" 2>/dev/null)
|
||||
if test "${line%% *}" == "PORT"; then
|
||||
port="${line#PORT }"
|
||||
break
|
||||
fi
|
||||
kill -0 "$serverpid" 2>/dev/null || {
|
||||
echo "server exited early: $(cat "$serverlog")"
|
||||
exit 1
|
||||
}
|
||||
sleep 0.1
|
||||
done
|
||||
test -n "$port" || {
|
||||
echo "could not discover server port"
|
||||
exit 1
|
||||
}
|
||||
base="http://127.0.0.1:${port}"
|
||||
|
||||
which httrack >/dev/null || {
|
||||
echo "could not find httrack"
|
||||
exit 1
|
||||
}
|
||||
out="${tmpdir}/crawl"
|
||||
mkdir "$out"
|
||||
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
|
||||
refdir="${out}/hts-cache/ref"
|
||||
|
||||
# --- pass 1: crawl, interrupt once the blob download is underway -------------
|
||||
printf '[pass 1: interrupt mid-download] ..\t'
|
||||
httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
|
||||
crawlpid=$!
|
||||
# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
|
||||
# finalizes the cache and serializes the temp-ref.
|
||||
for _ in $(seq 1 300); do
|
||||
test -s "$counter" && break
|
||||
kill -0 "$crawlpid" 2>/dev/null || break
|
||||
sleep 0.1
|
||||
done
|
||||
sleep 0.5
|
||||
kill -TERM "$crawlpid" 2>/dev/null
|
||||
wait "$crawlpid" 2>/dev/null
|
||||
crawlpid=
|
||||
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
|
||||
echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
|
||||
exit 1
|
||||
}
|
||||
echo "OK (temp-ref present)"
|
||||
before=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
|
||||
# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
|
||||
# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
|
||||
printf '[pass 2: resume must terminate] ..\t'
|
||||
HANG_RC=137 # 128 + SIGKILL
|
||||
httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
|
||||
crawlpid=$!
|
||||
(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
|
||||
guard=$!
|
||||
rc=0
|
||||
wait "$crawlpid" 2>/dev/null || rc=$?
|
||||
crawlpid=
|
||||
kill "$guard" 2>/dev/null || true
|
||||
wait "$guard" 2>/dev/null || true
|
||||
if test "$rc" -eq "$HANG_RC"; then
|
||||
echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (terminated, rc=$rc)"
|
||||
|
||||
# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
|
||||
# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
|
||||
after=$(wc -c <"$counter" 2>/dev/null || echo 0)
|
||||
hits=$((after - before))
|
||||
printf '[bounded re-get count] ..\t'
|
||||
if test "$hits" -lt 2; then
|
||||
echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
|
||||
exit 1
|
||||
fi
|
||||
if test "$hits" -gt 8; then
|
||||
echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK (${hits} requests)"
|
||||
11
tests/21_local-intl-update.test
Normal file
11
tests/21_local-intl-update.test
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# #157: a dotless, accented URL named .html on the first crawl must keep .html
|
||||
# across an update -- not revert to the extensionless name.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'intl/Instalação_CVS_no_Ubuntu.html' \
|
||||
--not-found 'intl/Instalação_CVS_no_Ubuntu' \
|
||||
httrack 'BASEURL/intl/index.html'
|
||||
17
tests/22_local-broken-size.test
Executable file
17
tests/22_local-broken-size.test
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
|
||||
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# Default: warn, but the file is still written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-found 'bogus state \(broken size' \
|
||||
httrack 'BASEURL/size/index.html'
|
||||
|
||||
# -%B (tolerant): no warning, file written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-not-found 'bogus state' \
|
||||
httrack 'BASEURL/size/index.html' '-%B'
|
||||
19
tests/23_local-errpage.test
Normal file
19
tests/23_local-errpage.test
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
|
||||
# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
|
||||
# half also does not reproduce; the purge path is not exercised here.)
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/good.html' \
|
||||
--found 'errpage/empty.html' \
|
||||
--not-found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o0'
|
||||
|
||||
# Control -o1 (default): the 404 error page is written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o1'
|
||||
@@ -13,6 +13,7 @@ TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
|
||||
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
||||
TESTS_ENVIRONMENT += ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS)
|
||||
TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
||||
TESTS_ENVIRONMENT += V6_SUPPORT=$(V6_SUPPORT)
|
||||
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
||||
|
||||
TEST_EXTENSIONS = .test
|
||||
@@ -25,10 +26,12 @@ TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
01_engine-copyopt.test \
|
||||
01_engine-dns.test \
|
||||
01_engine-doitlog.test \
|
||||
01_engine-entities.test \
|
||||
01_engine-filter.test \
|
||||
@@ -38,6 +41,8 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-strsafe.test \
|
||||
02_manpage-regen.test \
|
||||
@@ -51,6 +56,15 @@ TESTS = \
|
||||
12_crawl_https.test \
|
||||
13_crawl_proxy_https.test \
|
||||
13_local-cookies.test \
|
||||
14_local-https.test
|
||||
14_local-https.test \
|
||||
15_local-types.test \
|
||||
16_local-assume.test \
|
||||
17_local-empty-ct.test \
|
||||
18_local-update.test \
|
||||
19_local-connect-fallback.test \
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -26,6 +28,7 @@ key="${testdir}/server.key"
|
||||
|
||||
tls=
|
||||
verbose=
|
||||
rerun=
|
||||
tmpdir=
|
||||
serverpid=
|
||||
crawlpid=
|
||||
@@ -89,6 +92,7 @@ nargs=$#
|
||||
while test "$pos" -lt "$nargs"; do
|
||||
case "${args[$pos]}" in
|
||||
--debug) verbose=1 ;;
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--no-purge)
|
||||
nopurge=1
|
||||
audit+=("--no-purge")
|
||||
@@ -105,7 +109,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -180,6 +184,31 @@ test "$crawlres" -eq 0 || ! result "httrack exited $crawlres" || {
|
||||
result "OK"
|
||||
grep -iE "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt" >&2
|
||||
|
||||
# --- optional second pass: re-mirror into the same dir (cache/update path) ----
|
||||
if test -n "$rerun"; then
|
||||
info "re-running httrack (update pass)"
|
||||
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" \
|
||||
"${moreargs[@]}" "${hts[@]}" >"${log}.2" 2>&1 &
|
||||
crawlpid=$!
|
||||
wait "$crawlpid"
|
||||
crawlres=$?
|
||||
crawlpid=
|
||||
test "$crawlres" -eq 0 || ! result "update pass exited $crawlres" || {
|
||||
cat "${log}.2" >&2
|
||||
exit 1
|
||||
}
|
||||
result "OK (update)"
|
||||
# The update summary reports "files updated"; a fresh crawl never does. Assert
|
||||
# it so a regression that bypasses the cache (re-crawls fresh) can't pass.
|
||||
info "checking update used the cache"
|
||||
if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "update pass did not report cache activity"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
hostroot=
|
||||
for cand in "${out}/127.0.0.1_${port}" "${out}/127.0.0.1"; do
|
||||
@@ -230,6 +259,22 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-found)
|
||||
i=$((i + 1))
|
||||
info "checking log matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
|
||||
result "not in log"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-not-found)
|
||||
i=$((i + 1))
|
||||
info "checking log lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
|
||||
result "present in log"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -15,6 +15,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
from urllib.parse import quote, unquote, urlsplit
|
||||
|
||||
@@ -118,11 +119,174 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# --- type/extension matrix (issue #267 family) -------------------------
|
||||
|
||||
def send_raw(self, body, content_type):
|
||||
"""Send a raw body with an explicit Content-Type, or none at all when
|
||||
content_type is None (to observe httrack's typeless-file naming)."""
|
||||
self.send_response(200)
|
||||
if content_type is not None:
|
||||
self.send_header("Content-Type", content_type)
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# Fake-binary blobs for the image/pdf/typeless cases.
|
||||
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
|
||||
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
|
||||
|
||||
# path -> (body, content_type); None sends no header, "" sends an empty
|
||||
# Content-Type value (no usable type, must be treated like None).
|
||||
TYPE_MATRIX = {
|
||||
"/types/control.php": (b"<html><body>control</body></html>", "text/html"),
|
||||
"/types/photo.png": (FAKE_PNG, "image/png"),
|
||||
"/types/doc.pdf": (FAKE_PDF, "application/pdf"),
|
||||
"/types/notype.png": (FAKE_PNG, None),
|
||||
"/types/notype.pdf": (FAKE_PDF, None),
|
||||
"/types/emptyct.png": (FAKE_PNG, ""),
|
||||
"/types/lie.png": (FAKE_PNG, "text/html"),
|
||||
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
|
||||
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
|
||||
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
|
||||
"/types/style.css": (b"body { color: red; }\n", "text/css"),
|
||||
"/types/data.json": (b'{"k": "v"}\n', "application/json"),
|
||||
"/types/gen.php": (FAKE_PNG, "image/png"),
|
||||
}
|
||||
|
||||
def route_types_index(self):
|
||||
body = (
|
||||
'\t<a href="control.php">control</a>\n'
|
||||
'\t<img src="photo.png" />\n'
|
||||
'\t<a href="doc.pdf">doc</a>\n'
|
||||
'\t<img src="notype.png" />\n'
|
||||
'\t<a href="notype.pdf">notypepdf</a>\n'
|
||||
'\t<img src="emptyct.png" />\n'
|
||||
'\t<img src="lie.png" />\n'
|
||||
'\t<a href="report.pdf">report</a>\n'
|
||||
'\t<a href="page.htm">htm</a>\n'
|
||||
'\t<script src="script.js"></script>\n'
|
||||
'\t<link rel="stylesheet" href="style.css" />\n'
|
||||
'\t<a href="data.json">json</a>\n'
|
||||
'\t<img src="gen.php?id=5" />\n'
|
||||
)
|
||||
self.send_html(body)
|
||||
|
||||
def route_types(self):
|
||||
path = urlsplit(self.path).path
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# --- special chars in URLs across an update (issue #157) ---------------
|
||||
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||
# name the first crawl picks (.html) must survive the update pass.
|
||||
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||
|
||||
def route_intl_index(self):
|
||||
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||
|
||||
def route_intl_page(self):
|
||||
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||
|
||||
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||
RESUME_LEN = len(RESUME_PREFIX) + 4096 # declared length never delivered
|
||||
_resume_started = False
|
||||
|
||||
def route_resume_index(self):
|
||||
self.send_html('\t<a href="blob.txt">blob</a>')
|
||||
|
||||
def route_resume(self):
|
||||
counter = os.environ.get("RESUME_COUNTER")
|
||||
if counter:
|
||||
with open(counter, "a") as fp:
|
||||
fp.write("x")
|
||||
# First GET: stall mid-body so the crawl can be interrupted with a partial.
|
||||
if not Handler._resume_started:
|
||||
Handler._resume_started = True
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "image/png")
|
||||
self.send_header("Content-Length", str(self.RESUME_LEN))
|
||||
self.send_header("Accept-Ranges", "bytes")
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(self.RESUME_PREFIX)
|
||||
self.wfile.flush()
|
||||
try:
|
||||
while True:
|
||||
time.sleep(3600)
|
||||
except OSError:
|
||||
pass
|
||||
return
|
||||
self.send_response(416, "Requested Range Not Satisfiable")
|
||||
self.send_header("Content-Type", "image/png")
|
||||
self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||
def route_errpage_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="good.html">good</a>\n'
|
||||
'\t<a href="missing.html">missing</a>\n'
|
||||
'\t<a href="empty.html">empty</a>\n'
|
||||
)
|
||||
|
||||
def route_errpage_good(self):
|
||||
self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
|
||||
|
||||
def route_errpage_missing(self):
|
||||
self.send_html("\t404 error body", status=404, extra_status="Not Found")
|
||||
|
||||
def route_errpage_empty(self):
|
||||
self.send_raw(b"", "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
def route_size_index(self):
|
||||
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||
|
||||
def route_size_oversize(self):
|
||||
body = b"A" * 100
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
self.send_header("Content-Length", str(len(body) - 2)) # lie: too short
|
||||
self.send_header("Connection", "close")
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
"/cookies/third.php": route_third,
|
||||
"/robots.txt": route_robots,
|
||||
"/types/index.html": route_types_index,
|
||||
"/types/control.php": route_types,
|
||||
"/types/photo.png": route_types,
|
||||
"/types/doc.pdf": route_types,
|
||||
"/types/notype.png": route_types,
|
||||
"/types/notype.pdf": route_types,
|
||||
"/types/emptyct.png": route_types,
|
||||
"/types/lie.png": route_types,
|
||||
"/types/report.pdf": route_types,
|
||||
"/types/page.htm": route_types,
|
||||
"/types/script.js": route_types,
|
||||
"/types/style.css": route_types,
|
||||
"/types/data.json": route_types,
|
||||
"/types/gen.php": route_types,
|
||||
"/intl/index.html": route_intl_index,
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
"/size/index.html": route_size_index,
|
||||
"/size/oversize.bin": route_size_oversize,
|
||||
"/errpage/index.html": route_errpage_index,
|
||||
"/errpage/good.html": route_errpage_good,
|
||||
"/errpage/missing.html": route_errpage_missing,
|
||||
"/errpage/empty.html": route_errpage_empty,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
@@ -130,7 +294,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
    """Route the current request through ROUTES.

    Resets the per-request cookie list, reduces self.path to its path
    component (query string and fragment dropped), and looks that path up in
    ROUTES. When a handler is found it is called with this request handler
    and True is returned.

    NOTE(review): only the routed branch is visible here; what happens for
    an unrouted path is decided by code following this span.
    """
    self._set_cookies = []
    path = urlsplit(self.path).path
    # Match percent-encoded paths (accented #157 route) by their decoded form.
    # The raw path is tried first, so no separate plain lookup is needed.
    handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
    if handler is not None:
        handler(self)
        return True
|
||||
|
||||
@@ -206,9 +206,10 @@ main() {
|
||||
cp -a "$export_dir/debian" "httrack-$ver/debian"
|
||||
)
|
||||
|
||||
# Build (debuild also runs lintian and signs). --fail-on aborts on a lintian
|
||||
# error or warning, so neither a release nor CI produces an unclean package.
|
||||
local -a debuild_opts=(--lintian-opts -I -i "--fail-on=error,warning")
|
||||
# Build and sign. debuild runs lintian too but does NOT propagate its exit
|
||||
# status, so a broken package would pass unnoticed; disable it here and run
|
||||
# lintian ourselves below as the real gate.
|
||||
local -a debuild_opts=(--no-lintian)
|
||||
local -a build_opts=()
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||
if [[ $unsigned -eq 1 ]]; then
|
||||
@@ -219,7 +220,8 @@ main() {
|
||||
info "building packages with debuild"
|
||||
(
|
||||
cd "$scratch/httrack-$ver"
|
||||
debuild "${build_opts[@]}" "${debuild_opts[@]}"
|
||||
# debuild options (--no-lintian) must precede the dpkg-buildpackage ones
|
||||
debuild "${debuild_opts[@]}" "${build_opts[@]}"
|
||||
)
|
||||
|
||||
# Collect every file the .changes references (orig, dsc, debs, ddebs, buildinfo).
|
||||
@@ -229,6 +231,16 @@ main() {
|
||||
changes=("$scratch"/*.changes)
|
||||
shopt -u nullglob
|
||||
[[ ${#changes[@]} -ge 1 ]] || die "debuild produced no .changes file"
|
||||
|
||||
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||
# means the local lintian is older than the buildds', not a package
|
||||
# defect, so suppress it. set -e turns any error/warning tag into a failure.
|
||||
info "running lintian gate (--fail-on=error,warning)"
|
||||
lintian --profile debian -I -i --fail-on=error,warning \
|
||||
--suppress-tags newer-standards-version "${changes[@]}"
|
||||
|
||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||
|
||||
# Clean-room build gate: rebuild the source package in a minimal chroot that
|
||||
|
||||
Reference in New Issue
Block a user