mirror of
https://github.com/xroche/httrack.git
synced 2026-06-25 03:27:22 +03:00
Compare commits
73 Commits
feature/ap
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58f368a91a | ||
|
|
c97b3e233e | ||
|
|
b615a4e7fd | ||
|
|
594cf0da39 | ||
|
|
3845cd1fb3 | ||
|
|
94bffb0804 | ||
|
|
a5c86e7e89 | ||
|
|
54f5717057 | ||
|
|
40fc9de360 | ||
|
|
4614eefefe | ||
|
|
b0e8262db0 | ||
|
|
addbd3136b | ||
|
|
a64c4cd160 | ||
|
|
1611dbcabf | ||
|
|
099501ee50 | ||
|
|
1b9eefa3b4 | ||
|
|
9c8d3a41eb | ||
|
|
ae77cd9d6d | ||
|
|
51b8dcd81c | ||
|
|
bcce664143 | ||
|
|
7a24add87c | ||
|
|
2308e7bafd | ||
|
|
ef5691fc47 | ||
|
|
0a6eb73903 | ||
|
|
fdb243e5a2 | ||
|
|
f8546e146d | ||
|
|
b7f602f2eb | ||
|
|
550100b56a | ||
|
|
33ddb27243 | ||
|
|
4606dfbf66 | ||
|
|
a6f1b9a3dd | ||
|
|
fb35d6a0f1 | ||
|
|
8a270fec03 | ||
|
|
0cbd5279f2 | ||
|
|
05306ee4fd | ||
|
|
1d0fc0a566 | ||
|
|
a4452592b4 | ||
|
|
62c2364b59 | ||
|
|
fe7041ddbf | ||
|
|
f5543df1af | ||
|
|
fee30aa95d | ||
|
|
f9f4700ee1 | ||
|
|
f030fa21e3 | ||
|
|
bdd1c1bc2c | ||
|
|
56665a268f | ||
|
|
2e948b9acd | ||
|
|
cae11499f1 | ||
|
|
02c7f4ebf6 | ||
|
|
9070b44a70 | ||
|
|
799c045061 | ||
|
|
fb1ee3bf2e | ||
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 | ||
|
|
0145ec37a3 | ||
|
|
a80fab38ba | ||
|
|
c52a524a63 | ||
|
|
1907621d37 | ||
|
|
3b2d7afdaa | ||
|
|
6ee539619e | ||
|
|
fb098b27b4 | ||
|
|
5f6a3fb917 | ||
|
|
f9e676dbe3 | ||
|
|
1b440c44b5 | ||
|
|
ac6dd1a570 | ||
|
|
4549ec3695 | ||
|
|
ac56c31b24 | ||
|
|
ee6beeeb7d | ||
|
|
6788bda380 | ||
|
|
7ead8d595e | ||
|
|
93f502990c | ||
|
|
0f4b2596b2 | ||
|
|
4a676bb5e1 |
@@ -16,6 +16,7 @@ BasedOnStyle: LLVM
|
|||||||
SpaceAfterCStyleCast: true # "(int) x", overwhelmingly dominant (542 vs 7)
|
SpaceAfterCStyleCast: true # "(int) x", overwhelmingly dominant (542 vs 7)
|
||||||
SortIncludes: false # C include order can be significant; never reorder
|
SortIncludes: false # C include order can be significant; never reorder
|
||||||
IncludeBlocks: Preserve # do not merge/reflow include groups
|
IncludeBlocks: Preserve # do not merge/reflow include groups
|
||||||
|
SeparateDefinitionBlocks: Always # blank line between definitions (readability)
|
||||||
|
|
||||||
# Stated explicitly for robustness against base-style drift (these match LLVM):
|
# Stated explicitly for robustness against base-style drift (these match LLVM):
|
||||||
IndentWidth: 2
|
IndentWidth: 2
|
||||||
|
|||||||
5
.flake8
Normal file
5
.flake8
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
[flake8]
|
||||||
|
# Match black's formatting so the two tools don't fight.
|
||||||
|
max-line-length = 88
|
||||||
|
# E203/W503 conflict with black's slice and line-break style.
|
||||||
|
extend-ignore = E203, W503
|
||||||
55
.github/workflows/ci.yml
vendored
55
.github/workflows/ci.yml
vendored
@@ -227,34 +227,47 @@ jobs:
|
|||||||
# Validate the Debian packaging via the same script maintainers release with.
|
# Validate the Debian packaging via the same script maintainers release with.
|
||||||
# One amd64/gcc run is enough: packaging (control/rules/manifest/lintian/quilt
|
# One amd64/gcc run is enough: packaging (control/rules/manifest/lintian/quilt
|
||||||
# source build) is arch- and compiler-independent, and the build matrix above
|
# source build) is arch- and compiler-independent, and the build matrix above
|
||||||
# already covers compile portability. lintian runs with --fail-on=error.
|
# already covers compile portability. mkdeb.sh runs lintian as an explicit gate
|
||||||
|
# (debuild does not propagate lintian's exit) with --fail-on=error,warning.
|
||||||
deb:
|
deb:
|
||||||
name: deb package (lintian)
|
name: deb package (lintian)
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
# Build and gate inside Debian sid, the upload target. A Debian dpkg-deb
|
||||||
|
# produces archive-legal xz members (an Ubuntu host defaults to zstd, which
|
||||||
|
# the archive's lintian rejects), and sid's lintian carries the same
|
||||||
|
# data-driven checks (embedded-lib fingerprints and the like) the buildds and
|
||||||
|
# UDD apply -- so issues surface here instead of after upload.
|
||||||
|
container: debian:sid
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Install packaging toolchain
|
- name: Install packaging toolchain
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
sudo apt-get update
|
apt-get update
|
||||||
sudo apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates git \
|
||||||
build-essential autoconf automake libtool autoconf-archive \
|
build-essential autoconf automake libtool autoconf-archive \
|
||||||
zlib1g-dev libssl-dev \
|
zlib1g-dev libssl-dev \
|
||||||
debhelper devscripts lintian fakeroot
|
debhelper devscripts lintian fakeroot
|
||||||
|
|
||||||
|
- uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
|
||||||
# --unsigned: CI has no GPG key (also skips the release sig/checksums).
|
# --unsigned: CI has no GPG key (also skips the release sig/checksums).
|
||||||
# debuild builds every package, then lintian gates on errors.
|
# mkdeb builds every package then runs the lintian gate (--fail-on=error,
|
||||||
|
# warning); debuild runs the packaged test pass.
|
||||||
#
|
#
|
||||||
# DEB_BUILD_OPTIONS trims work CI does not need (release builds via
|
# DEB_BUILD_OPTIONS trims work CI does not need (release builds via
|
||||||
# mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
|
# mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
|
||||||
# LTO payloads are slow to compress and that CI never ships; parallel uses
|
# LTO payloads are slow to compress and that CI never ships; parallel uses
|
||||||
# every core. We let debuild run its test pass -- the only one now that
|
# every core.
|
||||||
# mkdeb no longer runs its own -- so CI exercises the packaged tests.
|
- name: Build and lint Debian packages
|
||||||
- name: Build Debian packages
|
|
||||||
run: |
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
# The workspace volume is owned by the host runner uid, but the
|
||||||
|
# container runs as root, so mkdeb's git calls (superproject and the
|
||||||
|
# coucal submodule) trip "dubious ownership"; mark them all safe.
|
||||||
|
git config --global --add safe.directory "*"
|
||||||
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
|
||||||
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
bash tools/mkdeb.sh --unsigned --no-release-artifacts
|
||||||
|
|
||||||
@@ -320,6 +333,21 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
name: lint (shellcheck, shfmt)
|
name: lint (shellcheck, shfmt)
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
# Every tracked shell script; the globs expand at run time. Kept here so the
|
||||||
|
# shellcheck and shfmt steps below cannot drift apart.
|
||||||
|
env:
|
||||||
|
SHELL_SCRIPTS: >-
|
||||||
|
.githooks/pre-commit
|
||||||
|
bootstrap
|
||||||
|
build.sh
|
||||||
|
html/div/search.sh
|
||||||
|
man/makeman.sh
|
||||||
|
src/htsbasiccharsets.sh
|
||||||
|
src/htsentities.sh
|
||||||
|
src/webhttrack
|
||||||
|
tests/*.sh
|
||||||
|
tests/*.test
|
||||||
|
tools/mkdeb.sh
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
@@ -332,12 +360,11 @@ jobs:
|
|||||||
sudo apt-get install -y --no-install-recommends shellcheck shfmt
|
sudo apt-get install -y --no-install-recommends shellcheck shfmt
|
||||||
shfmt --version
|
shfmt --version
|
||||||
|
|
||||||
# Lint the scripts we maintain; the legacy scripts are a separate cleanup.
|
|
||||||
- name: shellcheck
|
- name: shellcheck
|
||||||
run: shellcheck man/makeman.sh tools/mkdeb.sh .githooks/pre-commit tests/*.test tests/check-network.sh
|
run: shellcheck $SHELL_SCRIPTS
|
||||||
|
|
||||||
- name: shfmt
|
- name: shfmt
|
||||||
run: shfmt -d -i 4 man/makeman.sh tools/mkdeb.sh .githooks/pre-commit
|
run: shfmt -d -i 4 $SHELL_SCRIPTS
|
||||||
|
|
||||||
# Check clang-format on CHANGED LINES ONLY. The engine predates clang-format
|
# Check clang-format on CHANGED LINES ONLY. The engine predates clang-format
|
||||||
# (it was shaped by an old Visual Studio formatter) and does not round-trip,
|
# (it was shaped by an old Visual Studio formatter) and does not round-trip,
|
||||||
|
|||||||
12
configure.ac
12
configure.ac
@@ -1,6 +1,6 @@
|
|||||||
AC_PREREQ([2.71])
|
AC_PREREQ([2.71])
|
||||||
|
|
||||||
AC_INIT([httrack], [3.49.8], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
AC_INIT([httrack], [3.49.9], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||||
AC_COPYRIGHT([
|
AC_COPYRIGHT([
|
||||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||||
@@ -29,9 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
|||||||
AC_CONFIG_MACRO_DIR([m4])
|
AC_CONFIG_MACRO_DIR([m4])
|
||||||
AC_CONFIG_HEADERS(config.h)
|
AC_CONFIG_HEADERS(config.h)
|
||||||
AM_INIT_AUTOMAKE([subdir-objects])
|
AM_INIT_AUTOMAKE([subdir-objects])
|
||||||
# 3:0:0: htsblk layout changed (contenttype/charset/contentencoding widened to
|
# 3:1:0: 3.49.9 changed code but not the exported interface vs 3.49.8 (same 164
|
||||||
# 128), an incompatible ABI break, so bump current and reset revision/age.
|
# symbols, no struct-layout change), so bump revision only. (3:0:0 was the htsblk
|
||||||
VERSION_INFO="3:0:0"
|
# mime-buffer widening, an ABI break that moved the soname .so.2 -> .so.3.)
|
||||||
|
VERSION_INFO="3:1:0"
|
||||||
AM_MAINTAINER_MODE
|
AM_MAINTAINER_MODE
|
||||||
AC_USE_SYSTEM_EXTENSIONS
|
AC_USE_SYSTEM_EXTENSIONS
|
||||||
|
|
||||||
@@ -214,9 +215,12 @@ AC_SUBST(OPENSSL_LIBS)
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
### Support IPv6
|
### Support IPv6
|
||||||
|
V6_SUPPORT=no
|
||||||
AC_CHECK_LIB(c, getaddrinfo, [V6_FLAG="-DINET6"
|
AC_CHECK_LIB(c, getaddrinfo, [V6_FLAG="-DINET6"
|
||||||
|
V6_SUPPORT=yes
|
||||||
AC_DEFINE(HTS_INET6, 1, [Check for IPv6])], AC_MSG_WARN([*** IPv6 not found IPv6 compatibility disabled]))
|
AC_DEFINE(HTS_INET6, 1, [Check for IPv6])], AC_MSG_WARN([*** IPv6 not found IPv6 compatibility disabled]))
|
||||||
AC_SUBST(V6_FLAG)
|
AC_SUBST(V6_FLAG)
|
||||||
|
AC_SUBST(V6_SUPPORT)
|
||||||
|
|
||||||
### Check for LFS
|
### Check for LFS
|
||||||
AC_CHECK_LIB(c, fopen64, [LFS_FLAG="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE"
|
AC_CHECK_LIB(c, fopen64, [LFS_FLAG="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE"
|
||||||
|
|||||||
31
debian/changelog
vendored
31
debian/changelog
vendored
@@ -1,6 +1,33 @@
|
|||||||
|
httrack (3.49.9-1) unstable; urgency=medium
|
||||||
|
|
||||||
|
* New upstream release: Content-Type and file-type detection fixes (trust a
|
||||||
|
declared Content-Type over a binary URL extension, honor --assume under the
|
||||||
|
delayed type check, keep a known extension against a bogus or empty
|
||||||
|
Content-Type, and avoid an uninitialised read on an empty Content-Type), and
|
||||||
|
restored C++ source-compatibility of the installed headers so reverse
|
||||||
|
dependencies (httraqt) build again.
|
||||||
|
|
||||||
|
-- Xavier Roche <xavier@debian.org> Sun, 21 Jun 2026 17:59:38 +0200
|
||||||
|
|
||||||
|
httrack (3.49.8-2) unstable; urgency=medium
|
||||||
|
|
||||||
|
* Rename libhttrack2 to libhttrack3 to follow the SONAME, which the 3.49.8
|
||||||
|
ABI bump moved to libhttrack.so.3 (package-name-doesnt-match-sonames). In
|
||||||
|
3.49.8-1 the libhttrack2.files glob still matched .so.2, so the runtime
|
||||||
|
libraries fell through into the httrack package and libhttrack2 shipped no
|
||||||
|
library. The new .files uses a .so.3* wildcard so a future SONAME bump no
|
||||||
|
longer silently misplaces the libraries. New binary package, via NEW.
|
||||||
|
* Drop the stale debian/libhttrack-swf1.files: the swf module is no longer
|
||||||
|
built and no libhttrack-swf1 package exists.
|
||||||
|
|
||||||
|
-- Xavier Roche <xavier@debian.org> Sat, 20 Jun 2026 14:42:13 +0200
|
||||||
|
|
||||||
httrack (3.49.8-1) unstable; urgency=medium
|
httrack (3.49.8-1) unstable; urgency=medium
|
||||||
|
|
||||||
* New upstream release.
|
* New upstream release: HTTPS-proxy CONNECT tunnelling and wider srcset
|
||||||
|
parsing, a batch of crawler and parser fixes (CSS @import, xmlns
|
||||||
|
namespaces, relative paths, RFC 6265 cookies), and security hardening of
|
||||||
|
the parser and of buffer copies throughout the engine.
|
||||||
* Drop the OpenSSL linking exception from the license: OpenSSL 3.0+ is
|
* Drop the OpenSSL linking exception from the license: OpenSSL 3.0+ is
|
||||||
Apache-2.0 and GPL-compatible, so it is no longer needed. httrack is now
|
Apache-2.0 and GPL-compatible, so it is no longer needed. httrack is now
|
||||||
plain GPL-3.0-or-later. Updated debian/copyright accordingly.
|
plain GPL-3.0-or-later. Updated debian/copyright accordingly.
|
||||||
@@ -14,7 +41,7 @@ httrack (3.49.8-1) unstable; urgency=medium
|
|||||||
the QA debcheck page. Depend on firefox-esr | chromium | www-browser
|
the QA debcheck page. Depend on firefox-esr | chromium | www-browser
|
||||||
instead.
|
instead.
|
||||||
|
|
||||||
-- Xavier Roche <xavier@debian.org> Sun, 07 Jun 2026 14:29:24 +0200
|
-- Xavier Roche <xavier@debian.org> Sat, 20 Jun 2026 13:02:08 +0200
|
||||||
|
|
||||||
httrack (3.49.7-2) unstable; urgency=medium
|
httrack (3.49.7-2) unstable; urgency=medium
|
||||||
|
|
||||||
|
|||||||
6
debian/control
vendored
6
debian/control
vendored
@@ -58,13 +58,13 @@ Description: webhttrack common files
|
|||||||
This package is the common files of webhttrack, website copier and
|
This package is the common files of webhttrack, website copier and
|
||||||
mirroring utility
|
mirroring utility
|
||||||
|
|
||||||
Package: libhttrack2
|
Package: libhttrack3
|
||||||
Architecture: any
|
Architecture: any
|
||||||
Multi-Arch: same
|
Multi-Arch: same
|
||||||
Section: libs
|
Section: libs
|
||||||
Replaces: libhttrack1
|
|
||||||
Conflicts: libhttrack1
|
|
||||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||||
|
Replaces: libhttrack2, httrack (<< 3.49.8-2~)
|
||||||
|
Breaks: libhttrack2, httrack (<< 3.49.8-2~)
|
||||||
Description: Httrack website copier library
|
Description: Httrack website copier library
|
||||||
This package is the library part of httrack, website copier and mirroring
|
This package is the library part of httrack, website copier and mirroring
|
||||||
utility
|
utility
|
||||||
|
|||||||
106
debian/copyright
vendored
106
debian/copyright
vendored
@@ -1,21 +1,109 @@
|
|||||||
This package was debianized by Xavier Roche <roche@httrack.com> on
|
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||||
Fri, 27 Sep 2002 16:42:26 +0200
|
Upstream-Name: httrack
|
||||||
|
Upstream-Contact: Xavier Roche <roche@httrack.com>
|
||||||
|
Source: https://www.httrack.com/
|
||||||
|
|
||||||
The current Debian maintainer is Xavier Roche <xavier@debian.org>
|
Files: *
|
||||||
|
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||||
|
License: GPL-3+
|
||||||
|
Comment:
|
||||||
|
The engine includes contributions from Yann Philippot (src/htsjava.c,
|
||||||
|
src/htsjava.h). htsbasenet.h links against the system OpenSSL library
|
||||||
|
(originally by Eric Young); no OpenSSL/SSLeay code is bundled here.
|
||||||
|
|
||||||
Upstream author: Xavier Roche <roche@httrack.com>
|
Files: src/minizip/*
|
||||||
|
Copyright: 1998-2010 Gilles Vollant
|
||||||
|
2007-2008 Even Rouault
|
||||||
|
2009-2010 Mathias Svensson
|
||||||
|
1990-2000 Info-ZIP
|
||||||
|
License: Zlib
|
||||||
|
Comment:
|
||||||
|
The decryption code in src/minizip/crypt.h and src/minizip/unzip.c derives
|
||||||
|
from the Info-ZIP distribution, distributed under the same terms.
|
||||||
|
|
||||||
Copyright: 1998-2014 Xavier Roche and other contributors
|
Files: src/md5.c
|
||||||
|
Copyright: 1993 Colin Plumb
|
||||||
|
License: public-domain-md5
|
||||||
|
This code implements the MD5 message-digest algorithm, due to Ron Rivest.
|
||||||
|
It was written by Colin Plumb in 1993, no copyright is claimed. This code
|
||||||
|
is in the public domain; do with it what you wish.
|
||||||
|
|
||||||
|
Files: src/coucal/*
|
||||||
|
Copyright: 2013-2014 Xavier Roche
|
||||||
|
License: BSD-3-clause
|
||||||
|
|
||||||
|
Files: src/coucal/murmurhash3.h*
|
||||||
|
Copyright: Austin Appleby
|
||||||
|
License: public-domain-murmurhash3
|
||||||
|
MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||||
|
domain. The author hereby disclaims copyright to this source code.
|
||||||
|
|
||||||
|
Files: html/server/div/com.httrack.WebHTTrack.metainfo.xml
|
||||||
|
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||||
|
License: FSFAP
|
||||||
|
Copying and distribution of this file, with or without modification, are
|
||||||
|
permitted in any medium without royalty provided the copyright notice and
|
||||||
|
this notice are preserved. This file is offered as-is, without any warranty.
|
||||||
|
|
||||||
|
Files: debian/*
|
||||||
|
Copyright: 2002-2026 Xavier Roche <xavier@debian.org>
|
||||||
|
License: GPL-3+
|
||||||
|
|
||||||
|
License: GPL-3+
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
the Free Software Foundation, either version 3 of the License, or
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
(at your option) any later version.
|
(at your option) any later version.
|
||||||
|
.
|
||||||
On Debian systems, the complete text of the GNU General Public
|
|
||||||
License version 3 can be found in /usr/share/common-licenses/GPL-3 file.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
This program is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
|
.
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
.
|
||||||
|
On Debian systems, the complete text of the GNU General Public License
|
||||||
|
version 3 can be found in /usr/share/common-licenses/GPL-3.
|
||||||
|
|
||||||
|
License: Zlib
|
||||||
|
This software is provided 'as-is', without any express or implied warranty.
|
||||||
|
In no event will the authors be held liable for any damages arising from the
|
||||||
|
use of this software.
|
||||||
|
.
|
||||||
|
Permission is granted to anyone to use this software for any purpose,
|
||||||
|
including commercial applications, and to alter it and redistribute it
|
||||||
|
freely, subject to the following restrictions:
|
||||||
|
.
|
||||||
|
1. The origin of this software must not be misrepresented; you must not claim
|
||||||
|
that you wrote the original software. If you use this software in a product,
|
||||||
|
an acknowledgment in the product documentation would be appreciated but is
|
||||||
|
not required.
|
||||||
|
2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
misrepresented as being the original software.
|
||||||
|
3. This notice may not be removed or altered from any source distribution.
|
||||||
|
|
||||||
|
License: BSD-3-clause
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
.
|
||||||
|
1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|||||||
3
debian/httrack-doc.lintian-overrides
vendored
3
debian/httrack-doc.lintian-overrides
vendored
@@ -4,3 +4,6 @@
|
|||||||
# so the path lives in the display pointer, not the override -- match with '*'.
|
# so the path lives in the display pointer, not the override -- match with '*'.
|
||||||
httrack-doc: extra-license-file *
|
httrack-doc: extra-license-file *
|
||||||
httrack-doc: package-contains-documentation-outside-usr-share-doc *
|
httrack-doc: package-contains-documentation-outside-usr-share-doc *
|
||||||
|
# search.sh is a sample CGI shipped alongside the HTML manual, not meant to be
|
||||||
|
# run from the package tree; it stays non-executable by design.
|
||||||
|
httrack-doc: script-not-executable *
|
||||||
|
|||||||
2
debian/libhttrack-swf1.files
vendored
2
debian/libhttrack-swf1.files
vendored
@@ -1,2 +0,0 @@
|
|||||||
usr/lib/*/libhtsswf.so.1.0.0
|
|
||||||
usr/lib/*/libhtsswf.so.1
|
|
||||||
5
debian/libhttrack2.files
vendored
5
debian/libhttrack2.files
vendored
@@ -1,5 +0,0 @@
|
|||||||
usr/lib/*/libhttrack.so.2.0.49
|
|
||||||
usr/lib/*/libhttrack.so.2
|
|
||||||
usr/lib/*/libhtsjava.so.2.0.49
|
|
||||||
usr/lib/*/libhtsjava.so.2
|
|
||||||
usr/share/httrack/templates
|
|
||||||
3
debian/libhttrack2.lintian-overrides
vendored
3
debian/libhttrack2.lintian-overrides
vendored
@@ -1,3 +0,0 @@
|
|||||||
# The shared libraries ship without a versioned symbols control file (ABI is
|
|
||||||
# tracked via the SONAME and a strict =version dependency, see debian/rules).
|
|
||||||
libhttrack2: no-symbols-control-file usr/lib/*
|
|
||||||
3
debian/libhttrack3.files
vendored
Normal file
3
debian/libhttrack3.files
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
usr/lib/*/libhttrack.so.3*
|
||||||
|
usr/lib/*/libhtsjava.so.3*
|
||||||
|
usr/share/httrack/templates
|
||||||
8
debian/libhttrack3.lintian-overrides
vendored
Normal file
8
debian/libhttrack3.lintian-overrides
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# The shared libraries ship without a versioned symbols control file (ABI is
|
||||||
|
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
||||||
|
libhttrack3: no-symbols-control-file usr/lib/*
|
||||||
|
|
||||||
|
# Bundled, locally patched minizip (src/minizip): it adds a zipFlush() API the
|
||||||
|
# system libminizip lacks (htscache.c flushes the cache .zip so an interrupted
|
||||||
|
# crawl leaves a valid archive), plus Android/old-zlib portability fixes.
|
||||||
|
libhttrack3: embedded-library *libminizip*
|
||||||
3
debian/proxytrack.lintian-overrides
vendored
Normal file
3
debian/proxytrack.lintian-overrides
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Statically linked against httrack's bundled, patched minizip (see src/minizip
|
||||||
|
# and libhttrack3's override): the zipFlush() API is absent from the system one.
|
||||||
|
proxytrack: embedded-library *libminizip*
|
||||||
2
debian/rules
vendored
2
debian/rules
vendored
@@ -135,7 +135,7 @@ binary-arch: build install
|
|||||||
dh_makeshlibs -a -X/usr/lib/$(DEB_HOST_MULTIARCH)/httrack/libtest --version-info
|
dh_makeshlibs -a -X/usr/lib/$(DEB_HOST_MULTIARCH)/httrack/libtest --version-info
|
||||||
dh_installdeb -a
|
dh_installdeb -a
|
||||||
# we depend on the current version (ABI may change)
|
# we depend on the current version (ABI may change)
|
||||||
dh_shlibdeps -a -ldebian/libhttrack2/usr/lib/$(DEB_HOST_MULTIARCH)
|
dh_shlibdeps -a -ldebian/libhttrack3/usr/lib/$(DEB_HOST_MULTIARCH)
|
||||||
dh_gencontrol -a
|
dh_gencontrol -a
|
||||||
dh_md5sums -a
|
dh_md5sums -a
|
||||||
dh_builddeb -a
|
dh_builddeb -a
|
||||||
|
|||||||
27
history.txt
27
history.txt
@@ -4,13 +4,38 @@ HTTrack Website Copier release history:
|
|||||||
|
|
||||||
This file lists all changes and fixes that have been made for HTTrack
|
This file lists all changes and fixes that have been made for HTTrack
|
||||||
|
|
||||||
|
3.49-9
|
||||||
|
+ Fixed: file-type detection from the Content-Type header: trust a declared type over a binary URL extension, honor --assume under the delayed type check, and keep a known extension against a bogus or empty Content-Type (#267, #29, #56)
|
||||||
|
+ Fixed: an uninitialized-buffer read when the Content-Type is empty (#411)
|
||||||
|
+ Fixed: restored C++ source-compatibility of the installed headers so reverse dependencies (httraqt) build again (#413)
|
||||||
|
+ Changed: multiple internal build, packaging and test-harness improvements
|
||||||
|
|
||||||
3.49-8
|
3.49-8
|
||||||
|
+ New: tunnel HTTPS downloads through the configured HTTP proxy via CONNECT (#85)
|
||||||
|
+ New: parse every candidate URL in <img> and <source> srcset lists (#326)
|
||||||
+ Changed: dropped the obsolete OpenSSL linking exception (OpenSSL 3.0+ is Apache-2.0 and GPL-compatible); httrack is now plain GPLv3-or-later
|
+ Changed: dropped the obsolete OpenSSL linking exception (OpenSSL 3.0+ is Apache-2.0 and GPL-compatible); httrack is now plain GPLv3-or-later
|
||||||
+ Fixed: link libhtsjava and the libtest examples directly against libc
|
+ Fixed: several out-of-bounds reads in the HTML/CSS parser on hostile input (#94, #396)
|
||||||
|
+ Fixed: stored XSS via an unescaped URL in the generated page footer (#165)
|
||||||
|
+ Fixed: hardened buffer copies throughout the engine against overflow
|
||||||
|
+ Fixed: capture conditional CSS @import URLs (#94)
|
||||||
|
+ Fixed: don't crawl xmlns namespace declarations as links (#191)
|
||||||
|
+ Fixed: don't mistake the method argument of XMLHttpRequest.open for a URL (#218)
|
||||||
|
+ Fixed: percent-encode parentheses when rewriting CSS url() targets (#163)
|
||||||
|
+ Fixed: collapse ../ in file:// URLs and widen relative-link handling (#137, #162)
|
||||||
|
+ Fixed: drop the obsolete $Version/$Path attributes from the request Cookie header, per RFC 6265 (#151)
|
||||||
|
+ Fixed: keep empty quoted arguments when reloading doit.log for --update/--continue (#106)
|
||||||
|
+ Fixed: raise the User-Agent and custom-header length limits (#152)
|
||||||
|
+ Fixed: abort on a long log path (lock-file buffer too small) (#183)
|
||||||
|
+ Fixed: race in lazy mutex initialization (#297)
|
||||||
|
+ Fixed: sub-second mtime precision when comparing local files on POSIX (#383)
|
||||||
|
+ Fixed: modernize OpenSSL TLS initialization for the 3.x to 4.x transition (#308)
|
||||||
+ Fixed: in-place changes made by the postprocess callback were not applied (Roman Sęk)
|
+ Fixed: in-place changes made by the postprocess callback were not applied (Roman Sęk)
|
||||||
+ Fixed: "preffered" typo in the help text and man page (yosinn1-blip)
|
+ Fixed: "preffered" typo in the help text and man page (yosinn1-blip)
|
||||||
+ Fixed: corrections and updates of the Russian translation (German Aizek)
|
+ Fixed: corrections and updates of the Russian translation (German Aizek)
|
||||||
+ Fixed: corrections and updates of the Danish translation (scootergrisen)
|
+ Fixed: corrections and updates of the Danish translation (scootergrisen)
|
||||||
|
+ Fixed: link libhtsjava and the libtest examples directly against libc
|
||||||
|
+ New: documented the public library API headers and typed the option fields as named enums
|
||||||
|
+ Fixed: numerous build, packaging, CI and test-coverage improvements (out-of-tree builds, sanitizer/distcheck CI, shell and Python linting, AppStream metainfo)
|
||||||
|
|
||||||
3.49-7
|
3.49-7
|
||||||
+ Fixed: keep generated config.h architecture-independent (Debian #1133728)
|
+ Fixed: keep generated config.h architecture-independent (Debian #1133728)
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
# Simple indexing test using HTTrack
|
# Simple indexing test using HTTrack
|
||||||
@@ -18,22 +17,22 @@ if ! test -f "index.txt"; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Convert crlf to lf
|
# Convert crlf to lf
|
||||||
if test "`head index.txt -n 1 | tr '\r' '#' | grep -c '#'`" = "1"; then
|
if test "$(head index.txt -n 1 | tr '\r' '#' | grep -c '#')" = "1"; then
|
||||||
echo "Converting index to Unix LF style (not CR/LF) .."
|
echo "Converting index to Unix LF style (not CR/LF) .."
|
||||||
mv -f index.txt index.txt.old
|
mv -f index.txt index.txt.old
|
||||||
cat index.txt.old|tr -d '\r' > index.txt
|
tr -d '\r' <index.txt.old >index.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
keyword=-
|
keyword=-
|
||||||
while test -n "$keyword"; do
|
while test -n "$keyword"; do
|
||||||
printf "Enter a keyword: "
|
printf "Enter a keyword: "
|
||||||
read keyword
|
read -r keyword
|
||||||
|
|
||||||
if test -n "$keyword"; then
|
if test -n "$keyword"; then
|
||||||
FOUNDK="`grep -niE \"^$keyword\" index.txt`"
|
FOUNDK="$(grep -niE "^$keyword" index.txt)"
|
||||||
|
|
||||||
if test -n "$FOUNDK"; then
|
if test -n "$FOUNDK"; then
|
||||||
if ! test `echo "$FOUNDK"|wc -l` = "1"; then
|
if ! test "$(echo "$FOUNDK" | wc -l)" = "1"; then
|
||||||
# Multiple matches
|
# Multiple matches
|
||||||
printf "Found multiple keywords: "
|
printf "Found multiple keywords: "
|
||||||
echo "$FOUNDK" | cut -f2 -d':' | tr '\n' ' '
|
echo "$FOUNDK" | cut -f2 -d':' | tr '\n' ' '
|
||||||
@@ -41,12 +40,12 @@ while test -n "$keyword"; do
|
|||||||
echo "Use keyword$ to find only one"
|
echo "Use keyword$ to find only one"
|
||||||
else
|
else
|
||||||
# One match
|
# One match
|
||||||
N=`echo "$FOUNDK"|cut -f1 -d':'`
|
N=$(echo "$FOUNDK" | cut -f1 -d':')
|
||||||
PM=`tail +$N index.txt|grep -nE "\("|head -n 1`
|
PM=$(tail "+$N" index.txt | grep -nE "\(" | head -n 1)
|
||||||
if ! echo "$PM" | grep "ignored" >/dev/null; then
|
if ! echo "$PM" | grep "ignored" >/dev/null; then
|
||||||
M=`echo $PM|cut -f1 -d':'`
|
M=$(echo "$PM" | cut -f1 -d':')
|
||||||
echo "Found in:"
|
echo "Found in:"
|
||||||
cat index.txt | tail "+$N" | head -n "$M" | grep -E "[0-9]* " | cut -f2 -d' '
|
tail "+$N" index.txt | head -n "$M" | grep -E "[0-9]* " | cut -f2 -d' '
|
||||||
else
|
else
|
||||||
echo "keyword ignored (too many hits)"
|
echo "keyword ignored (too many hits)"
|
||||||
fi
|
fi
|
||||||
@@ -57,4 +56,3 @@ while test -n "$keyword"; do
|
|||||||
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ whttrackrundir = $(bindir)
|
|||||||
whttrackrun_SCRIPTS = webhttrack
|
whttrackrun_SCRIPTS = webhttrack
|
||||||
|
|
||||||
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||||
htscache_selftest.c \
|
htscache_selftest.c htsdns_selftest.c \
|
||||||
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
htscatchurl.c htsfilters.c htsftp.c htshash.c coucal/coucal.c \
|
||||||
htshelp.c htslib.c htscoremain.c \
|
htshelp.c htslib.c htscoremain.c \
|
||||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||||
@@ -66,7 +66,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
|||||||
md5.c \
|
md5.c \
|
||||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||||
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htscatchurl.h \
|
htsbasenet.h htsbauth.h htscache.h htscache_selftest.h htsdns_selftest.h htscatchurl.h \
|
||||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||||
|
|||||||
@@ -48,9 +48,8 @@ Please visit our Website: http://www.httrack.com
|
|||||||
/* Abort (with the failed byte count) when a growth allocation fails. The
|
/* Abort (with the failed byte count) when a growth allocation fails. The
|
||||||
array macros never return an out-of-memory error; they assert and abort. */
|
array macros never return an out-of-memory error; they assert and abort. */
|
||||||
static void hts_record_assert_memory_failed(const size_t size) {
|
static void hts_record_assert_memory_failed(const size_t size) {
|
||||||
fprintf(stderr, "memory allocation failed (%lu bytes)", \
|
fprintf(stderr, "memory allocation failed (%lu bytes)", (long int) size);
|
||||||
(long int) size); \
|
assertf(!"memory allocation failed");
|
||||||
assertf(! "memory allocation failed"); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Dynamic array of T elements. **/
|
/** Dynamic array of T elements. **/
|
||||||
@@ -109,20 +108,22 @@ static void hts_record_assert_memory_failed(const size_t size) {
|
|||||||
* After a call to this macro, TypedArrayRoom(A) is guaranteed to be at
|
* After a call to this macro, TypedArrayRoom(A) is guaranteed to be at
|
||||||
* least equal to 'ROOM'.
|
* least equal to 'ROOM'.
|
||||||
**/
|
**/
|
||||||
#define TypedArrayEnsureRoom(A, ROOM) do { \
|
#define TypedArrayEnsureRoom(A, ROOM) \
|
||||||
|
do { \
|
||||||
const size_t room_ = (ROOM); \
|
const size_t room_ = (ROOM); \
|
||||||
while (TypedArrayRoom(A) < room_) { \
|
while (TypedArrayRoom(A) < room_) { \
|
||||||
TypedArrayCapa(A) = TypedArrayCapa(A) < 16 ? 16 : TypedArrayCapa(A) * 2; \
|
TypedArrayCapa(A) = TypedArrayCapa(A) < 16 ? 16 : TypedArrayCapa(A) * 2; \
|
||||||
} \
|
} \
|
||||||
TypedArrayPtr(A) = realloc(TypedArrayPtr(A), \
|
TypedArrayPtr(A) = \
|
||||||
TypedArrayCapa(A)*TypedArrayWidth(A)); \
|
realloc(TypedArrayPtr(A), TypedArrayCapa(A) * TypedArrayWidth(A)); \
|
||||||
if (TypedArrayPtr(A) == NULL) { \
|
if (TypedArrayPtr(A) == NULL) { \
|
||||||
hts_record_assert_memory_failed(TypedArrayCapa(A) * TypedArrayWidth(A)); \
|
hts_record_assert_memory_failed(TypedArrayCapa(A) * TypedArrayWidth(A)); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Add an element. Macro, first element evaluated multiple times. **/
|
/** Add an element. Macro, first element evaluated multiple times. **/
|
||||||
#define TypedArrayAdd(A, E) do { \
|
#define TypedArrayAdd(A, E) \
|
||||||
|
do { \
|
||||||
TypedArrayEnsureRoom(A, 1); \
|
TypedArrayEnsureRoom(A, 1); \
|
||||||
assertf(TypedArraySize(A) < TypedArrayCapa(A)); \
|
assertf(TypedArraySize(A) < TypedArrayCapa(A)); \
|
||||||
TypedArrayTail(A) = (E); \
|
TypedArrayTail(A) = (E); \
|
||||||
@@ -133,7 +134,8 @@ static void hts_record_assert_memory_failed(const size_t size) {
|
|||||||
* Add 'COUNT' elements from 'PTR'.
|
* Add 'COUNT' elements from 'PTR'.
|
||||||
* Macro, first element evaluated multiple times.
|
* Macro, first element evaluated multiple times.
|
||||||
**/
|
**/
|
||||||
#define TypedArrayAppend(A, PTR, COUNT) do { \
|
#define TypedArrayAppend(A, PTR, COUNT) \
|
||||||
|
do { \
|
||||||
const size_t count_ = (COUNT); \
|
const size_t count_ = (COUNT); \
|
||||||
/* This 1-case is to benefit from type safety. */ \
|
/* This 1-case is to benefit from type safety. */ \
|
||||||
if (count_ == 1) { \
|
if (count_ == 1) { \
|
||||||
@@ -148,7 +150,8 @@ static void hts_record_assert_memory_failed(const size_t size) {
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Clear an array, freeing memory and clearing size and capacity. **/
|
/** Clear an array, freeing memory and clearing size and capacity. **/
|
||||||
#define TypedArrayFree(A) do { \
|
#define TypedArrayFree(A) \
|
||||||
|
do { \
|
||||||
if (TypedArrayPtr(A) != NULL) { \
|
if (TypedArrayPtr(A) != NULL) { \
|
||||||
TypedArrayCapa(A) = TypedArraySize(A) = 0; \
|
TypedArrayCapa(A) = TypedArraySize(A) = 0; \
|
||||||
free(TypedArrayPtr(A)); \
|
free(TypedArrayPtr(A)); \
|
||||||
|
|||||||
151
src/htsback.c
151
src/htsback.c
@@ -73,6 +73,8 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
|||||||
|
|
||||||
sback->count = back_max;
|
sback->count = back_max;
|
||||||
sback->lnk = (lien_back *) calloct((back_max + 1), sizeof(lien_back));
|
sback->lnk = (lien_back *) calloct((back_max + 1), sizeof(lien_back));
|
||||||
|
sback->connect_fallback = (hts_connect_fallback *) calloct(
|
||||||
|
(back_max + 1), sizeof(hts_connect_fallback));
|
||||||
sback->ready = coucal_new(0);
|
sback->ready = coucal_new(0);
|
||||||
hts_set_hash_handler(sback->ready, opt);
|
hts_set_hash_handler(sback->ready, opt);
|
||||||
coucal_set_name(sback->ready, "back_new");
|
coucal_set_name(sback->ready, "back_new");
|
||||||
@@ -83,6 +85,7 @@ struct_back *back_new(httrackp *opt, int back_max) {
|
|||||||
sback->lnk[i].r.location = sback->lnk[i].location_buffer;
|
sback->lnk[i].r.location = sback->lnk[i].location_buffer;
|
||||||
sback->lnk[i].status = STATUS_FREE;
|
sback->lnk[i].status = STATUS_FREE;
|
||||||
sback->lnk[i].r.soc = INVALID_SOCKET;
|
sback->lnk[i].r.soc = INVALID_SOCKET;
|
||||||
|
sback->connect_fallback[i].addr_count = -1; // not yet probed
|
||||||
}
|
}
|
||||||
return sback;
|
return sback;
|
||||||
}
|
}
|
||||||
@@ -93,6 +96,7 @@ void back_free(struct_back ** sback) {
|
|||||||
freet((*sback)->lnk);
|
freet((*sback)->lnk);
|
||||||
(*sback)->lnk = NULL;
|
(*sback)->lnk = NULL;
|
||||||
}
|
}
|
||||||
|
freet((*sback)->connect_fallback);
|
||||||
if ((*sback)->ready != NULL) {
|
if ((*sback)->ready != NULL) {
|
||||||
coucal_delete(&(*sback)->ready);
|
coucal_delete(&(*sback)->ready);
|
||||||
(*sback)->ready_size_bytes = 0;
|
(*sback)->ready_size_bytes = 0;
|
||||||
@@ -102,6 +106,72 @@ void back_free(struct_back ** sback) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Upper bound, in seconds, on how long a single connect candidate may stay
   pending while another resolved address is still available. It replaces the
   full slot timeout (120s by default) for every candidate except the last
   one, which bounds the stall caused by e.g. a dead IPv6 address while staying
   well above the duration of a normal handshake. */
#define HTS_CONNECT_FALLBACK_TIMEOUT 10

/* Tell whether a pending connect should be given up in favour of the next
   address.
   addr_index: 0-based index of the candidate currently being connected
   addr_count: number of resolved candidates
   elapsed:    seconds spent so far on the current candidate
   timeout:    slot timeout in seconds; <= 0 means no timeout management
   Returns non-zero only when a later candidate exists, timeouts are managed,
   and 'elapsed' reached the smaller of 'timeout' and
   HTS_CONNECT_FALLBACK_TIMEOUT. Returns 0 otherwise. */
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
                              int timeout) {
  const int has_fallback = addr_index + 1 < addr_count;
  const int timeout_managed = timeout > 0;

  if (has_fallback && timeout_managed) {
    const int cap = HTS_CONNECT_FALLBACK_TIMEOUT;
    const int limit = timeout < cap ? timeout : cap;

    return elapsed >= limit;
  }
  /* last (or only) candidate, or no timeout management: never force it */
  return 0;
}
|
||||||
|
|
||||||
|
/* Pending-connect result for a non-blocking socket reported ready by select():
|
||||||
|
0 = connected, >0 = the connect errno (refused, unreachable, ...), -1 if the
|
||||||
|
probe itself failed. A failed connect is reported writable too, so this is
|
||||||
|
how success is told from failure without blocking. */
|
||||||
|
static int connect_socket_error(T_SOC soc) {
|
||||||
|
int soerr = 0;
|
||||||
|
socklen_t len = (socklen_t) sizeof(soerr);
|
||||||
|
|
||||||
|
if (getsockopt(soc, SOL_SOCKET, SO_ERROR, (char *) &soerr, &len) != 0)
|
||||||
|
return -1;
|
||||||
|
return soerr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retry a stuck/failed connecting slot against its next resolved address.
|
||||||
|
Closes the current socket and starts a non-blocking connect to the next
|
||||||
|
candidate, leaving the slot in STATUS_CONNECTING. Returns 1 if a new connect
|
||||||
|
was started, 0 if no fallback address remains (caller fails the slot). */
|
||||||
|
static int back_connect_next(httrackp *opt, struct_back *sback, int i) {
|
||||||
|
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||||
|
lien_back *const back = sback->lnk;
|
||||||
|
const int next = cf->addr_index + 1;
|
||||||
|
T_SOC soc;
|
||||||
|
|
||||||
|
if (next >= cf->addr_count)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (back[i].r.soc != INVALID_SOCKET) {
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
}
|
||||||
|
soc = newhttp_addr(opt, back[i].url_adr, &back[i].r, -1, 0, next, NULL);
|
||||||
|
if (soc == INVALID_SOCKET)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
back[i].r.soc = soc;
|
||||||
|
cf->addr_index = next;
|
||||||
|
cf->connect_start = time_local();
|
||||||
|
if (back[i].timeout > 0)
|
||||||
|
back[i].timeout_refresh = cf->connect_start;
|
||||||
|
back[i].status = STATUS_CONNECTING;
|
||||||
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
|
"connect failed, trying next address (%d/%d) for %s", next + 1,
|
||||||
|
cf->addr_count, back[i].url_adr);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
void back_delete_all(httrackp * opt, cache_back * cache, struct_back * sback) {
|
void back_delete_all(httrackp * opt, cache_back * cache, struct_back * sback) {
|
||||||
if (sback != NULL) {
|
if (sback != NULL) {
|
||||||
int i;
|
int i;
|
||||||
@@ -1913,6 +1983,9 @@ int back_add(struct_back * sback, httrackp * opt, cache_back * cache, const char
|
|||||||
hts_init_htsblk(&back[p].r);
|
hts_init_htsblk(&back[p].r);
|
||||||
// memset(&(back[p].r), 0, sizeof(htsblk));
|
// memset(&(back[p].r), 0, sizeof(htsblk));
|
||||||
back[p].r.location = back[p].location_buffer;
|
back[p].r.location = back[p].location_buffer;
|
||||||
|
// fresh connect: address list not yet probed, start at the first
|
||||||
|
sback->connect_fallback[p].addr_index = 0;
|
||||||
|
sback->connect_fallback[p].addr_count = -1;
|
||||||
// recopier proxy
|
// recopier proxy
|
||||||
if ((back[p].r.req.proxy.active = opt->proxy.active)) {
|
if ((back[p].r.req.proxy.active = opt->proxy.active)) {
|
||||||
if (StringBuff(opt->proxy.bindhost) != NULL)
|
if (StringBuff(opt->proxy.bindhost) != NULL)
|
||||||
@@ -2369,6 +2442,9 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
// en cas de gestion du connect préemptif
|
// en cas de gestion du connect préemptif
|
||||||
#if HTS_XCONN
|
#if HTS_XCONN
|
||||||
if (back[i].status == STATUS_CONNECTING) { // connexion
|
if (back[i].status == STATUS_CONNECTING) { // connexion
|
||||||
|
// a connecting slot always carries a live socket; guard anyway so a
|
||||||
|
// stray INVALID_SOCKET can never reach FD_SET (mirrors the recv branch)
|
||||||
|
if (back[i].r.soc != INVALID_SOCKET) {
|
||||||
do_wait = 1;
|
do_wait = 1;
|
||||||
|
|
||||||
// noter socket write
|
// noter socket write
|
||||||
@@ -2385,6 +2461,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
// ID socket la plus élevée
|
// ID socket la plus élevée
|
||||||
nfds = back[i].r.soc;
|
nfds = back[i].r.soc;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
@@ -2517,8 +2594,20 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
}
|
}
|
||||||
// ---- FLAG WRITE MIS A UN?: POUR LE CONNECT
|
// ---- FLAG WRITE MIS A UN?: POUR LE CONNECT
|
||||||
if (back[i].status == STATUS_CONNECTING) { // attendre connect
|
if (back[i].status == STATUS_CONNECTING) { // attendre connect
|
||||||
|
hts_connect_fallback *const cf = &sback->connect_fallback[i];
|
||||||
int dispo = 0;
|
int dispo = 0;
|
||||||
|
|
||||||
|
// probe the resolved address list once per fresh connect (cache hit:
|
||||||
|
// the host was resolved when this connect was opened)
|
||||||
|
if (cf->addr_count < 0 && back[i].r.soc != INVALID_SOCKET &&
|
||||||
|
!back[i].r.is_file) {
|
||||||
|
SOCaddr scratch[HTS_MAXADDRNUM];
|
||||||
|
|
||||||
|
cf->addr_count = hts_dns_resolve_all(opt, back[i].url_adr, scratch,
|
||||||
|
HTS_MAXADDRNUM, NULL);
|
||||||
|
cf->connect_start = time_local();
|
||||||
|
}
|
||||||
|
|
||||||
// vérifier l'existence de timeout-check
|
// vérifier l'existence de timeout-check
|
||||||
if (!gestion_timeout)
|
if (!gestion_timeout)
|
||||||
if (back[i].timeout > 0)
|
if (back[i].timeout > 0)
|
||||||
@@ -2526,14 +2615,45 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
|
|
||||||
// connecté?
|
// connecté?
|
||||||
dispo = FD_ISSET(back[i].r.soc, &fds_c);
|
dispo = FD_ISSET(back[i].r.soc, &fds_c);
|
||||||
if (dispo) { // ok connected!!
|
if (dispo) { // socket ready: connect() finished (ok or failed)
|
||||||
|
// a refused/failed connect is reported writable too; probe SO_ERROR
|
||||||
|
// and, on failure, fall back to the next address (or fail the slot)
|
||||||
|
if (connect_socket_error(back[i].r.soc) != 0) {
|
||||||
|
if (!back_connect_next(opt, sback, i)) {
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||||
|
strcpybuff(back[i].r.msg, "Connect Error");
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
}
|
||||||
|
continue; // reconnected (stay connecting) or failed
|
||||||
|
}
|
||||||
busy_state = 1;
|
busy_state = 1;
|
||||||
|
|
||||||
#if HTS_USEOPENSSL
|
#if HTS_USEOPENSSL
|
||||||
/* SSL mode */
|
/* SSL mode */
|
||||||
if (back[i].r.ssl) {
|
if (back[i].r.ssl) {
|
||||||
|
int tunnel_ok = 1;
|
||||||
|
|
||||||
|
// https via proxy: CONNECT-tunnel before TLS (#85)
|
||||||
|
if (back[i].r.req.proxy.active && back[i].r.ssl_con == NULL) {
|
||||||
|
const int timeout = back[i].timeout > 0 ? back[i].timeout : 30;
|
||||||
|
|
||||||
|
tunnel_ok =
|
||||||
|
http_proxy_tunnel(opt, &back[i].r, back[i].url_adr, timeout);
|
||||||
|
if (!tunnel_ok) {
|
||||||
|
if (!strnotempty(back[i].r.msg))
|
||||||
|
strcpybuff(back[i].r.msg, "proxy CONNECT failed");
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_NON_FATAL;
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
// handshake not yet launched
|
// handshake not yet launched
|
||||||
if (!back[i].r.ssl_con) {
|
if (tunnel_ok && !back[i].r.ssl_con) {
|
||||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
||||||
// new session
|
// new session
|
||||||
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
||||||
@@ -2551,7 +2671,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
||||||
}
|
}
|
||||||
/* Error */
|
/* Error */
|
||||||
if (back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
if (tunnel_ok && back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||||
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
||||||
deletehttp(&back[i].r);
|
deletehttp(&back[i].r);
|
||||||
back[i].r.soc = INVALID_SOCKET;
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
@@ -3838,7 +3958,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
/* funny log for commandline users */
|
/* funny log for commandline users */
|
||||||
//if (!opt->quiet) {
|
//if (!opt->quiet) {
|
||||||
// petite animation
|
// petite animation
|
||||||
if (opt->verbosedisplay == 1) {
|
if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||||
if (back[i].status == STATUS_READY) {
|
if (back[i].status == STATUS_READY) {
|
||||||
if (back[i].r.statuscode == HTTP_OK)
|
if (back[i].r.statuscode == HTTP_OK)
|
||||||
printf("* %s%s (" LLintP " bytes) - OK" VT_CLREOL "\r",
|
printf("* %s%s (" LLintP " bytes) - OK" VT_CLREOL "\r",
|
||||||
@@ -3866,6 +3986,29 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
|
|
||||||
if (back[i].status > 0) { // réception/connexion/..
|
if (back[i].status > 0) { // réception/connexion/..
|
||||||
if (back[i].timeout > 0) {
|
if (back[i].timeout > 0) {
|
||||||
|
// a stuck connect with a fallback address: retry the next one well
|
||||||
|
// before the full timeout (dead IPv6 on a dual-stack host, ...)
|
||||||
|
if (back[i].status == STATUS_CONNECTING) {
|
||||||
|
const hts_connect_fallback *const cf =
|
||||||
|
&sback->connect_fallback[i];
|
||||||
|
|
||||||
|
if (back_connect_fallback_due(cf->addr_index, cf->addr_count,
|
||||||
|
(int) (act - cf->connect_start),
|
||||||
|
back[i].timeout)) {
|
||||||
|
if (back_connect_next(opt, sback, i)) {
|
||||||
|
continue; // reconnected to the next candidate
|
||||||
|
}
|
||||||
|
// fallback was due but no socket could be opened
|
||||||
|
// (back_connect_next closed the dead one): stop now rather than
|
||||||
|
// spin on an invalid fd
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_CONNERROR;
|
||||||
|
strcpybuff(back[i].r.msg, "Connect Error");
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
//printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout);
|
//printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout);
|
||||||
if (((int) (act - back[i].timeout_refresh)) >= back[i].timeout) {
|
if (((int) (act - back[i].timeout_refresh)) >= back[i].timeout) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "connection timed out for %s%s", back[i].url_adr,
|
hts_log_print(opt, LOG_DEBUG, "connection timed out for %s%s", back[i].url_adr,
|
||||||
|
|||||||
@@ -49,9 +49,10 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
// KB955045 (http://support.microsoft.com/kb/955045)
|
// KB955045 (http://support.microsoft.com/kb/955045)
|
||||||
// To execute an application using this function on earlier versions of Windows
|
// To execute an application using this function on earlier versions of Windows
|
||||||
// (Windows 2000, Windows NT, and Windows Me/98/95), then it is mandatory to #include Ws2tcpip.h
|
// (Windows 2000, Windows NT, and Windows Me/98/95), then it is mandatory to
|
||||||
// and also Wspiapi.h. When the Wspiapi.h header file is included, the 'getaddrinfo' function is
|
// #include Ws2tcpip.h and also Wspiapi.h. When the Wspiapi.h header file is
|
||||||
// #defined to the 'WspiapiGetAddrInfo' inline function in Wspiapi.h.
|
// included, the 'getaddrinfo' function is #defined to the 'WspiapiGetAddrInfo'
|
||||||
|
// inline function in Wspiapi.h.
|
||||||
#include <ws2tcpip.h>
|
#include <ws2tcpip.h>
|
||||||
#include <Wspiapi.h>
|
#include <Wspiapi.h>
|
||||||
// #include <winsock2.h>
|
// #include <winsock2.h>
|
||||||
|
|||||||
@@ -13,14 +13,14 @@ rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Produce code
|
# Produce code
|
||||||
printf "/** GENERATED FILE ($0), DO NOT EDIT **/\n\n"
|
printf '/** GENERATED FILE (%s), DO NOT EDIT **/\n\n' "$0"
|
||||||
for i in *.TXT; do
|
for i in *.TXT; do
|
||||||
echo "processing $i" >&2
|
echo "processing $i" >&2
|
||||||
grep -vE "^(#|$)" $i | grep -E "^0x" | sed -e 's/[[:space:]]/ /g' | cut -f1,2 -d' ' | \
|
grep -vE "^(#|$)" "$i" | grep -E "^0x" | sed -e 's/[[:space:]]/ /g' | cut -f1,2 -d' ' |
|
||||||
(
|
(
|
||||||
unset arr
|
unset arr
|
||||||
while read LINE ; do
|
while read -r LINE; do
|
||||||
from=$[$(echo $LINE | cut -f1 -d' ')]
|
from=$(($(echo "$LINE" | cut -f1 -d' ')))
|
||||||
if ! test -n "$from"; then
|
if ! test -n "$from"; then
|
||||||
echo "error with $i" >&2
|
echo "error with $i" >&2
|
||||||
exit 1
|
exit 1
|
||||||
@@ -28,22 +28,23 @@ for i in *.TXT ; do
|
|||||||
echo "out-of-range ($LINE) with $i" >&2
|
echo "out-of-range ($LINE) with $i" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
to=$(echo $LINE | cut -f2 -d' ')
|
to=$(echo "$LINE" | cut -f2 -d' ')
|
||||||
arr[$from]=$to
|
arr[from]=$to
|
||||||
done
|
done
|
||||||
name=$(echo $i | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
# shellcheck disable=SC2018,SC2019 # charset filenames are ASCII; keep C-locale A-Z/a-z
|
||||||
printf "/* Table for $i */\nstatic const hts_UCS4 table_${name}[256] = {\n "
|
name=$(echo "$i" | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
||||||
i=0
|
printf '/* Table for %s */\nstatic const hts_UCS4 table_%s[256] = {\n ' "$i" "$name"
|
||||||
while test "$i" -lt 256; do
|
idx=0
|
||||||
if test "$i" -gt 0; then
|
while test "$idx" -lt 256; do
|
||||||
|
if test "$idx" -gt 0; then
|
||||||
printf ", "
|
printf ", "
|
||||||
if test $[${i}%8] -eq 0; then
|
if test $((idx % 8)) -eq 0; then
|
||||||
printf "\n "
|
printf "\n "
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
value=${arr[$i]:-0}
|
value=${arr[$idx]:-0}
|
||||||
printf "0x%04x" $value
|
printf "0x%04x" "$value"
|
||||||
i=$[${i}+1]
|
idx=$((idx + 1))
|
||||||
done
|
done
|
||||||
printf " };\n\n"
|
printf " };\n\n"
|
||||||
)
|
)
|
||||||
@@ -53,7 +54,8 @@ done
|
|||||||
# Indexes
|
# Indexes
|
||||||
printf "static const struct {\n const char *name;\n const hts_UCS4 *table;\n} table_mappings[] = {\n"
|
printf "static const struct {\n const char *name;\n const hts_UCS4 *table;\n} table_mappings[] = {\n"
|
||||||
for i in *.TXT; do
|
for i in *.TXT; do
|
||||||
name=$(echo $i | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
# shellcheck disable=SC2018,SC2019 # charset filenames are ASCII; keep C-locale A-Z/a-z
|
||||||
printf " { \"$(echo $name | tr -d '_')\", table_${name} },\n"
|
name=$(echo "$i" | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
||||||
|
printf ' { "%s", table_%s },\n' "$(echo "$name" | tr -d '_')" "$name"
|
||||||
done
|
done
|
||||||
printf " { NULL, NULL }\n};\n"
|
printf " { NULL, NULL }\n};\n"
|
||||||
|
|||||||
@@ -71,7 +71,8 @@ struct t_cookie {
|
|||||||
int cookie_add(t_cookie *cookie, const char *cook_name, const char *cook_value,
|
int cookie_add(t_cookie *cookie, const char *cook_name, const char *cook_value,
|
||||||
const char *domain, const char *path);
|
const char *domain, const char *path);
|
||||||
|
|
||||||
int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, const char *path);
|
int cookie_del(t_cookie *cookie, const char *cook_name, const char *domain,
|
||||||
|
const char *path);
|
||||||
|
|
||||||
int cookie_load(t_cookie *cookie, const char *path, const char *name);
|
int cookie_load(t_cookie *cookie, const char *path, const char *name);
|
||||||
|
|
||||||
@@ -83,7 +84,8 @@ void cookie_delete(char *s, size_t s_size, size_t pos);
|
|||||||
|
|
||||||
const char *cookie_get(char *buffer, const char *cookie_base, int param);
|
const char *cookie_get(char *buffer, const char *cookie_base, int param);
|
||||||
|
|
||||||
char *cookie_find(char *s, const char *cook_name, const char *domain, const char *path);
|
char *cookie_find(char *s, const char *cook_name, const char *domain,
|
||||||
|
const char *path);
|
||||||
|
|
||||||
char *cookie_nextfield(char *a);
|
char *cookie_nextfield(char *a);
|
||||||
|
|
||||||
@@ -92,7 +94,8 @@ char *cookie_nextfield(char *a);
|
|||||||
/** Register credentials (auth = base-64 user:pass) for the prefix derived from
|
/** Register credentials (auth = base-64 user:pass) for the prefix derived from
|
||||||
adr (host) and fil (path). No-op returning 0 if cookie is NULL, allocation
|
adr (host) and fil (path). No-op returning 0 if cookie is NULL, allocation
|
||||||
fails, or a matching prefix is already stored; returns 1 on insertion. */
|
fails, or a matching prefix is already stored; returns 1 on insertion. */
|
||||||
int bauth_add(t_cookie * cookie, const char *adr, const char *fil, const char *auth);
|
int bauth_add(t_cookie *cookie, const char *adr, const char *fil,
|
||||||
|
const char *auth);
|
||||||
|
|
||||||
/** Return the stored base-64 credentials whose prefix matches adr+fil, or NULL
|
/** Return the stored base-64 credentials whose prefix matches adr+fil, or NULL
|
||||||
if none (or cookie is NULL). Returned pointer aliases the jar's bauth_chain;
|
if none (or cookie is NULL). Returned pointer aliases the jar's bauth_chain;
|
||||||
|
|||||||
@@ -135,7 +135,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, /* 128 bytes */ char *adr) {
|
|||||||
// returns 0 if error
|
// returns 0 if error
|
||||||
// url: buffer where URL must be stored - or ip:port in case of failure
|
// url: buffer where URL must be stored - or ip:port in case of failure
|
||||||
// data: 32Kb
|
// data: 32Kb
|
||||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data) {
|
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||||
|
char *data) {
|
||||||
int retour = 0;
|
int retour = 0;
|
||||||
|
|
||||||
// connexion (accept)
|
// connexion (accept)
|
||||||
|
|||||||
@@ -87,7 +87,8 @@ Please visit our Website: http://www.httrack.com
|
|||||||
// fast cache (build hash table)
|
// fast cache (build hash table)
|
||||||
#define HTS_FAST_CACHE 1
|
#define HTS_FAST_CACHE 1
|
||||||
|
|
||||||
// le > peut être considéré comme un tag de fermeture de commentaire (<!-- > est valide)
|
// le > peut être considéré comme un tag de fermeture de commentaire (<!-- > est
|
||||||
|
// valide)
|
||||||
#define GT_ENDS_COMMENT 1
|
#define GT_ENDS_COMMENT 1
|
||||||
|
|
||||||
// always adds a '/' at the end if a '~' is encountered (/~smith -> /~smith/)
|
// always adds a '/' at the end if a '~' is encountered (/~smith -> /~smith/)
|
||||||
@@ -97,7 +98,8 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#define HTS_STRIP_DOUBLE_SLASH 0
|
#define HTS_STRIP_DOUBLE_SLASH 0
|
||||||
|
|
||||||
// case-sensitive pour les dossiers et fichiers (0/1)
|
// case-sensitive pour les dossiers et fichiers (0/1)
|
||||||
// [normalement 1, mais pose des problèmes (url malformée par exemple) et n'est pas très utile..
|
// [normalement 1, mais pose des problèmes (url malformée par exemple) et n'est
|
||||||
|
// pas très utile..
|
||||||
// ..et pas bcp respecté]
|
// ..et pas bcp respecté]
|
||||||
// REMOVED
|
// REMOVED
|
||||||
// #define HTS_CASSE 0
|
// #define HTS_CASSE 0
|
||||||
|
|||||||
@@ -2585,7 +2585,7 @@ static int mkdir_compat(const char *pathname) {
|
|||||||
|
|
||||||
/* path must end with "/" or with the filename (/tmp/bar/ or /tmp/bar/foo.zip) */
|
/* path must end with "/" or with the filename (/tmp/bar/ or /tmp/bar/foo.zip) */
|
||||||
/* Note: preserve errno */
|
/* Note: preserve errno */
|
||||||
HTSEXT_API int dir_exists(const char *path) {
|
HTSEXT_API hts_boolean dir_exists(const char *path) {
|
||||||
const int err = errno;
|
const int err = errno;
|
||||||
STRUCT_STAT st;
|
STRUCT_STAT st;
|
||||||
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
||||||
@@ -3342,7 +3342,8 @@ int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
int ptr, int numero_passe) {
|
int ptr, int numero_passe) {
|
||||||
int n = back_pluggable_sockets(sback, opt);
|
int n = back_pluggable_sockets(sback, opt);
|
||||||
|
|
||||||
if (opt->savename_delayed == 2 && !opt->delayed_cached) /* cancel (always delayed) */
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||||
|
!opt->delayed_cached) /* cancel (always delayed) */
|
||||||
return 0;
|
return 0;
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
int p;
|
int p;
|
||||||
@@ -3646,7 +3647,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int p) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ask for termination
|
// ask for termination
|
||||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force) {
|
||||||
if (opt != NULL) {
|
if (opt != NULL) {
|
||||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
@@ -3656,7 +3657,7 @@ HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt) {
|
||||||
int ended;
|
int ended;
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
ended = opt->state.is_ended;
|
ended = opt->state.is_ended;
|
||||||
@@ -3678,12 +3679,12 @@ HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
|||||||
//}
|
//}
|
||||||
// ajout d'URL
|
// ajout d'URL
|
||||||
// -1 : erreur
|
// -1 : erreur
|
||||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url) {
|
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url) {
|
||||||
if (url)
|
if (url)
|
||||||
opt->state._hts_addurl = url;
|
opt->state._hts_addurl = url;
|
||||||
return (opt->state._hts_addurl != NULL);
|
return (opt->state._hts_addurl != NULL);
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_resetaddurl(httrackp * opt) {
|
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt) {
|
||||||
opt->state._hts_addurl = NULL;
|
opt->state._hts_addurl = NULL;
|
||||||
return (opt->state._hts_addurl != NULL);
|
return (opt->state._hts_addurl != NULL);
|
||||||
}
|
}
|
||||||
@@ -3702,6 +3703,8 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
|||||||
if (from->maxsoc > 0)
|
if (from->maxsoc > 0)
|
||||||
to->maxsoc = from->maxsoc;
|
to->maxsoc = from->maxsoc;
|
||||||
|
|
||||||
|
/* hts_tristate fields use HTS_DEFAULT (-1) for "unspecified": copy_htsopt
|
||||||
|
skips them so the target keeps its value. */
|
||||||
if (from->nearlink > -1)
|
if (from->nearlink > -1)
|
||||||
to->nearlink = from->nearlink;
|
to->nearlink = from->nearlink;
|
||||||
|
|
||||||
@@ -3844,7 +3847,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
|
|||||||
a = opt->savename_type;
|
a = opt->savename_type;
|
||||||
b = opt->savename_83;
|
b = opt->savename_83;
|
||||||
opt->savename_type = 0;
|
opt->savename_type = 0;
|
||||||
opt->savename_83 = 0;
|
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||||
// note: adr,fil peuvent être patchés
|
// note: adr,fil peuvent être patchés
|
||||||
r =
|
r =
|
||||||
url_savename(&afs, NULL, NULL, NULL, opt, sback, cache, hashptr, ptr, numero_passe,
|
url_savename(&afs, NULL, NULL, NULL, opt, sback, cache, hashptr, ptr, numero_passe,
|
||||||
|
|||||||
@@ -152,6 +152,15 @@ struct lien_adrfilsave {
|
|||||||
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Per-slot connect-fallback bookkeeping (parallel to struct_back.lnk).
|
||||||
|
Tracks which resolved address the slot is currently connecting to so a
|
||||||
|
stuck connect can be retried against the next one. */
|
||||||
|
typedef struct hts_connect_fallback {
|
||||||
|
int addr_index; /**< candidate being connected (0-based) */
|
||||||
|
int addr_count; /**< resolved addresses; -1 = not yet probed */
|
||||||
|
TStamp connect_start; /**< when the current candidate's connect began */
|
||||||
|
} hts_connect_fallback;
|
||||||
|
|
||||||
/** The download-slot ring: the set of concurrent transfers in flight.
|
/** The download-slot ring: the set of concurrent transfers in flight.
|
||||||
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
||||||
read it but do not resize or free it. */
|
read it but do not resize or free it. */
|
||||||
@@ -168,6 +177,7 @@ struct struct_back {
|
|||||||
int count; /**< number of usable slots (back_max) */
|
int count; /**< number of usable slots (back_max) */
|
||||||
coucal ready; /**< index of slots whose transfer completed */
|
coucal ready; /**< index of slots whose transfer completed */
|
||||||
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
||||||
|
hts_connect_fallback *connect_fallback; /**< per-slot, count+1 entries */
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
||||||
@@ -372,6 +382,13 @@ void check_rate(TStamp stat_timestart, int maxrate);
|
|||||||
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
||||||
Not thread-safe; call from the single crawl loop. */
|
Not thread-safe; call from the single crawl loop. */
|
||||||
|
|
||||||
|
/* True if a connecting slot should give up on the current address and try the
|
||||||
|
next one: a fallback address remains (addr_index+1 < addr_count) and the
|
||||||
|
candidate has been connecting for at least its deadline, min(timeout, an
|
||||||
|
internal cap). elapsed/timeout in seconds. Exposed for the -#D self-test. */
|
||||||
|
int back_connect_fallback_due(int addr_index, int addr_count, int elapsed,
|
||||||
|
int timeout);
|
||||||
|
|
||||||
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
||||||
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
||||||
room for naming tests and stops at 0 when the stack is nearly full. */
|
room for naming tests and stops at 0 when the stack is nearly full. */
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#include "htscharset.h"
|
#include "htscharset.h"
|
||||||
#include "htsencoding.h"
|
#include "htsencoding.h"
|
||||||
#include "htscache_selftest.h"
|
#include "htscache_selftest.h"
|
||||||
|
#include "htsdns_selftest.h"
|
||||||
#include "htsmd5.h"
|
#include "htsmd5.h"
|
||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
@@ -612,12 +613,12 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
/* Terminal is a tty, may ask questions and display funny information */
|
/* Terminal is a tty, may ask questions and display funny information */
|
||||||
if (isatty(1)) {
|
if (isatty(1)) {
|
||||||
opt->quiet = 0;
|
opt->quiet = 0;
|
||||||
opt->verbosedisplay = 1;
|
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||||
}
|
}
|
||||||
/* Not a tty, no stdin input or funny output! */
|
/* Not a tty, no stdin input or funny output! */
|
||||||
else {
|
else {
|
||||||
opt->quiet = 1;
|
opt->quiet = 1;
|
||||||
opt->verbosedisplay = 0;
|
opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -953,9 +954,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
p = buff;
|
p = buff;
|
||||||
do {
|
do {
|
||||||
int insert_after_argc;
|
int insert_after_argc;
|
||||||
|
int quoted; /* "" unquotes to empty but is still a real token (#106) */
|
||||||
|
|
||||||
// read next
|
// read next
|
||||||
lastp = p;
|
lastp = p;
|
||||||
|
quoted = (p != NULL && *p == '"');
|
||||||
if (p) {
|
if (p) {
|
||||||
p = next_token(p, 1);
|
p = next_token(p, 1);
|
||||||
if (p) {
|
if (p) {
|
||||||
@@ -966,7 +969,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
|
|
||||||
/* Insert parameters BUT so that they can be in the same order */
|
/* Insert parameters BUT so that they can be in the same order */
|
||||||
if (lastp) {
|
if (lastp) {
|
||||||
if (strnotempty(lastp)) {
|
if (strnotempty(lastp) || quoted) {
|
||||||
insert_after_argc = argc - insert_after;
|
insert_after_argc = argc - insert_after;
|
||||||
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
||||||
x_argvblk_size, x_ptr);
|
x_argvblk_size, x_ptr);
|
||||||
@@ -1815,24 +1818,22 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'L':
|
case 'L': {
|
||||||
{
|
sscanf(com + 1, "%d", (int *) &opt->savename_83);
|
||||||
sscanf(com + 1, "%d", &opt->savename_83);
|
|
||||||
switch (opt->savename_83) {
|
switch (opt->savename_83) {
|
||||||
case 0: // 8-3 (ISO9660 L1)
|
case 0: // 8-3 (ISO9660 L1)
|
||||||
opt->savename_83 = 1;
|
opt->savename_83 = HTS_SAVENAME_83_DOS;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
opt->savename_83 = 0;
|
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||||
break;
|
break;
|
||||||
default: // 2 == ISO9660 (ISO9660 L2)
|
default: // 2 == ISO9660 (ISO9660 L2)
|
||||||
opt->savename_83 = 2;
|
opt->savename_83 = HTS_SAVENAME_83_ISO9660;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
while (isdigit((unsigned char) *(com + 1)))
|
while (isdigit((unsigned char) *(com + 1)))
|
||||||
com++;
|
com++;
|
||||||
}
|
} break;
|
||||||
break;
|
|
||||||
case 's':
|
case 's':
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", (int *) &opt->robots);
|
sscanf(com + 1, "%d", (int *) &opt->robots);
|
||||||
@@ -1989,9 +1990,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
}
|
}
|
||||||
break; // url hack
|
break; // url hack
|
||||||
case 'v':
|
case 'v':
|
||||||
opt->verbosedisplay = 2;
|
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", &opt->verbosedisplay);
|
sscanf(com + 1, "%d", (int *) &opt->verbosedisplay);
|
||||||
while(isdigit((unsigned char) *(com + 1)))
|
while(isdigit((unsigned char) *(com + 1)))
|
||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
@@ -2004,9 +2005,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'N':
|
case 'N':
|
||||||
opt->savename_delayed = 2;
|
opt->savename_delayed = HTS_SAVENAME_DELAYED_HARD;
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", &opt->savename_delayed);
|
sscanf(com + 1, "%d", (int *) &opt->savename_delayed);
|
||||||
while(isdigit((unsigned char) *(com + 1)))
|
while(isdigit((unsigned char) *(com + 1)))
|
||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
@@ -2460,6 +2461,51 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'D': { // DNS resolver/cache self-test (mock getaddrinfo)
|
||||||
|
const int err = dns_selftests(opt);
|
||||||
|
|
||||||
|
printf("dns-selftest: %s\n", err ? "FAIL" : "OK");
|
||||||
|
htsmain_free();
|
||||||
|
return err;
|
||||||
|
} break;
|
||||||
|
case 'N': { // url_savename name resolution: httrack -#N <fil>
|
||||||
|
// <content-type>
|
||||||
|
if (na + 2 < argc) {
|
||||||
|
lien_adrfilsave afs;
|
||||||
|
cache_back cache;
|
||||||
|
struct_back *sback;
|
||||||
|
hash_struct hash;
|
||||||
|
lien_back headers;
|
||||||
|
|
||||||
|
memset(&afs, 0, sizeof(afs));
|
||||||
|
strcpybuff(afs.af.adr, "www.example.com");
|
||||||
|
strcpybuff(afs.af.fil, argv[na + 1]);
|
||||||
|
|
||||||
|
memset(&cache, 0, sizeof(cache));
|
||||||
|
cache.hashtable = (void *) coucal_new(0);
|
||||||
|
|
||||||
|
sback = back_new(opt, opt->maxsoc * 32 + 1024);
|
||||||
|
hash_init(opt, &hash, opt->urlhack);
|
||||||
|
|
||||||
|
memset(&headers, 0, sizeof(headers));
|
||||||
|
headers.status = 0;
|
||||||
|
headers.r.statuscode = HTTP_OK;
|
||||||
|
strcpybuff(headers.r.contenttype, argv[na + 2]);
|
||||||
|
strcpybuff(headers.url_fil, argv[na + 1]);
|
||||||
|
|
||||||
|
url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache,
|
||||||
|
&hash, 0, 0, &headers);
|
||||||
|
printf("savename: %s\n", afs.save);
|
||||||
|
htsmain_free();
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"Option #N requires <fil> <content-type> arguments\n");
|
||||||
|
htsmain_free();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
|
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
|
||||||
{
|
{
|
||||||
int hasFilter = 0;
|
int hasFilter = 0;
|
||||||
@@ -2579,7 +2625,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
(r.size >= 0) ? r.size : (-r.size));
|
(r.size >= 0) ? r.size : (-r.size));
|
||||||
if (r.contenttype >= 0) {
|
if (r.contenttype >= 0) {
|
||||||
fprintf(stdout, "Content-Type: %s\r\n",
|
fprintf(stdout, "Content-Type: %s\r\n",
|
||||||
r.contenttype);
|
hts_effective_mime(r.contenttype));
|
||||||
}
|
}
|
||||||
if (r.cdispo[0]) {
|
if (r.cdispo[0]) {
|
||||||
fprintf(stdout, "Content-Disposition: %s\r\n",
|
fprintf(stdout, "Content-Disposition: %s\r\n",
|
||||||
@@ -2787,6 +2833,47 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'l': /* lienrelatif: relative link from curr_fil to link */
|
||||||
|
if (na + 2 >= argc) {
|
||||||
|
HTS_PANIC_PRINTF(
|
||||||
|
"Option #l needs a link and a current-file path");
|
||||||
|
printf(
|
||||||
|
"Example: '-#l' 'host/dir/img.gif' 'host/dir/p.html'\n");
|
||||||
|
htsmain_free();
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
char s[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
|
if (lienrelatif(s, sizeof(s), argv[na + 1], argv[na + 2]) ==
|
||||||
|
0)
|
||||||
|
printf("relative=%s\n", s);
|
||||||
|
else
|
||||||
|
printf("relative=<ERROR>\n");
|
||||||
|
htsmain_free();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'i': /* ident_url_relatif: resolve a link -> adr/fil */
|
||||||
|
if (na + 3 >= argc) {
|
||||||
|
HTS_PANIC_PRINTF(
|
||||||
|
"Option #i needs a link, an origin address and file");
|
||||||
|
printf("Example: '-#i' '../img.gif' 'www.foo.com' "
|
||||||
|
"'/d/p.html'\n");
|
||||||
|
htsmain_free();
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
lien_adrfil af;
|
||||||
|
const int r = ident_url_relatif(argv[na + 1], argv[na + 2],
|
||||||
|
argv[na + 3], &af);
|
||||||
|
|
||||||
|
if (r == 0)
|
||||||
|
printf("adr=%s fil=%s\n", af.adr, af.fil);
|
||||||
|
else
|
||||||
|
printf("error=%d\n", r);
|
||||||
|
htsmain_free();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case '2': // mimedefs
|
case '2': // mimedefs
|
||||||
if (na + 1 >= argc) {
|
if (na + 1 >= argc) {
|
||||||
HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
|
HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
|
||||||
@@ -3096,6 +3183,88 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
htsmain_free();
|
htsmain_free();
|
||||||
return 0;
|
return 0;
|
||||||
break;
|
break;
|
||||||
|
case '9': { // copy_htsopt selftest: httrack -#9
|
||||||
|
httrackp *from = hts_create_opt();
|
||||||
|
httrackp *to = hts_create_opt();
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
/* from-values differ from both the to-values and the
|
||||||
|
hts_create_opt() defaults (nearlink FALSE, errpage/parseall
|
||||||
|
TRUE), so a copy that no-ops or just resets to defaults is
|
||||||
|
caught too, not only the unsigned-guard bug. */
|
||||||
|
from->retry = 7; /* int field: positive control */
|
||||||
|
to->retry = 0;
|
||||||
|
from->nearlink = HTS_TRUE;
|
||||||
|
to->nearlink = HTS_FALSE;
|
||||||
|
from->errpage = HTS_FALSE;
|
||||||
|
to->errpage = HTS_TRUE;
|
||||||
|
from->parseall = HTS_FALSE;
|
||||||
|
to->parseall = HTS_TRUE;
|
||||||
|
|
||||||
|
copy_htsopt(from, to);
|
||||||
|
|
||||||
|
if (to->retry != 7)
|
||||||
|
err = 1;
|
||||||
|
if (to->nearlink != HTS_TRUE)
|
||||||
|
err = 1;
|
||||||
|
if (to->errpage != HTS_FALSE)
|
||||||
|
err = 1;
|
||||||
|
if (to->parseall != HTS_FALSE)
|
||||||
|
err = 1;
|
||||||
|
|
||||||
|
/* HTS_DEFAULT (-1) is "unspecified": copy_htsopt must skip it,
|
||||||
|
leaving the target intact. Only a signed (int-backed) field
|
||||||
|
can hold -1, so this also guards the type against regressing
|
||||||
|
to an unsigned hts_boolean. */
|
||||||
|
from->parseall = HTS_DEFAULT;
|
||||||
|
to->parseall = HTS_TRUE;
|
||||||
|
copy_htsopt(from, to);
|
||||||
|
if (to->parseall != HTS_TRUE)
|
||||||
|
err = 1;
|
||||||
|
|
||||||
|
hts_free_opt(from);
|
||||||
|
hts_free_opt(to);
|
||||||
|
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||||
|
htsmain_free();
|
||||||
|
return err;
|
||||||
|
} break;
|
||||||
|
case 'Q': { // cookie request-header selftest: httrack -#Q
|
||||||
|
static t_cookie cookie;
|
||||||
|
char hdr[1024];
|
||||||
|
/* RFC 6265: bare name=value pairs, no $Version/$Path (#151). */
|
||||||
|
const char *expected = "Cookie: name=value; has_js=1" H_CRLF;
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
const char *dom = "www.example.com";
|
||||||
|
int added;
|
||||||
|
|
||||||
|
cookie.max_len = (int) sizeof(cookie.data);
|
||||||
|
cookie.data[0] = '\0';
|
||||||
|
added = cookie_add(&cookie, "name", "value", dom, "/");
|
||||||
|
added |= cookie_add(&cookie, "has_js", "1", dom, "/");
|
||||||
|
/* different domain: must be filtered out */
|
||||||
|
added |= cookie_add(&cookie, "junk", "x", "other.org", "/");
|
||||||
|
if (added) {
|
||||||
|
printf("cookie-header: FAIL (cookie_add setup)\n");
|
||||||
|
htsmain_free();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
http_cookie_header_selftest(&cookie, dom, "/", hdr,
|
||||||
|
sizeof(hdr));
|
||||||
|
if (strcmp(hdr, expected) != 0)
|
||||||
|
err = 1;
|
||||||
|
if (strstr(hdr, "$Version") != NULL ||
|
||||||
|
strstr(hdr, "$Path") != NULL)
|
||||||
|
err = 1;
|
||||||
|
if (strstr(hdr, "junk") != NULL) // wrong-domain cookie leaked
|
||||||
|
err = 1;
|
||||||
|
printf("cookie-header: %s\n", err ? "FAIL" : "OK");
|
||||||
|
if (err)
|
||||||
|
printf(" got: %s\n", hdr);
|
||||||
|
htsmain_free();
|
||||||
|
return err;
|
||||||
|
} break;
|
||||||
case '!':
|
case '!':
|
||||||
HTS_PANIC_PRINTF
|
HTS_PANIC_PRINTF
|
||||||
("Option #! is disabled for security reasons");
|
("Option #! is disabled for security reasons");
|
||||||
|
|||||||
@@ -109,8 +109,8 @@ typedef int (*t_hts_htmlcheck_chopt) (t_hts_callbackarg * carg, httrackp * opt);
|
|||||||
/* Rewrite hook over an in-memory page: the html and len arguments point at the
|
/* Rewrite hook over an in-memory page: the html and len arguments point at the
|
||||||
buffer and its length (the callback may reallocate and resize it),
|
buffer and its length (the callback may reallocate and resize it),
|
||||||
url_adresse and url_fichier name it. */
|
url_adresse and url_fichier name it. */
|
||||||
typedef int (*t_hts_htmlcheck_process) (t_hts_callbackarg * carg,
|
typedef int (*t_hts_htmlcheck_process)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
httrackp * opt, char **html, int *len,
|
char **html, int *len,
|
||||||
const char *url_adresse,
|
const char *url_adresse,
|
||||||
const char *url_fichier);
|
const char *url_fichier);
|
||||||
|
|
||||||
@@ -147,9 +147,8 @@ typedef const char *(*t_hts_htmlcheck_query3) (t_hts_callbackarg * carg,
|
|||||||
queue size and running totals, stat_time the elapsed time. */
|
queue size and running totals, stat_time the elapsed time. */
|
||||||
typedef int (*t_hts_htmlcheck_loop)(t_hts_callbackarg *carg, httrackp *opt,
|
typedef int (*t_hts_htmlcheck_loop)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
lien_back *back, int back_max,
|
lien_back *back, int back_max,
|
||||||
int back_index, int lien_tot,
|
int back_index, int lien_tot, int lien_ntot,
|
||||||
int lien_ntot, int stat_time,
|
int stat_time, hts_stat_struct *stats);
|
||||||
hts_stat_struct * stats);
|
|
||||||
|
|
||||||
/* Veto a link (adr host, fil path) after its transfer; status is the result.
|
/* Veto a link (adr host, fil path) after its transfer; status is the result.
|
||||||
Return 0 to drop the link. */
|
Return 0 to drop the link. */
|
||||||
@@ -168,8 +167,8 @@ typedef void (*t_hts_htmlcheck_pause) (t_hts_callbackarg * carg, httrackp * opt,
|
|||||||
const char *lockfile);
|
const char *lockfile);
|
||||||
|
|
||||||
/* Fired after a file is written to disk; 'file' is the local path. */
|
/* Fired after a file is written to disk; 'file' is the local path. */
|
||||||
typedef void (*t_hts_htmlcheck_filesave) (t_hts_callbackarg * carg,
|
typedef void (*t_hts_htmlcheck_filesave)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
httrackp * opt, const char *file);
|
const char *file);
|
||||||
|
|
||||||
/* Richer file-saved notification: source host/filename, local path, and flags
|
/* Richer file-saved notification: source host/filename, local path, and flags
|
||||||
telling whether the file is new, modified, or left unchanged. */
|
telling whether the file is new, modified, or left unchanged. */
|
||||||
@@ -189,13 +188,12 @@ typedef int (*t_hts_htmlcheck_linkdetected2) (t_hts_callbackarg * carg,
|
|||||||
const char *tag_start);
|
const char *tag_start);
|
||||||
|
|
||||||
/* Fired on each transfer-status change of slot 'back'. */
|
/* Fired on each transfer-status change of slot 'back'. */
|
||||||
typedef int (*t_hts_htmlcheck_xfrstatus) (t_hts_callbackarg * carg,
|
typedef int (*t_hts_htmlcheck_xfrstatus)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
httrackp * opt, lien_back * back);
|
lien_back *back);
|
||||||
|
|
||||||
/* Choose the local save path for a URL; write it into 'save'. adr/fil name the
|
/* Choose the local save path for a URL; write it into 'save'. adr/fil name the
|
||||||
target, referer_adr/referer_fil the page that linked it. */
|
target, referer_adr/referer_fil the page that linked it. */
|
||||||
typedef int (*t_hts_htmlcheck_savename) (t_hts_callbackarg * carg,
|
typedef int (*t_hts_htmlcheck_savename)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
httrackp * opt,
|
|
||||||
const char *adr_complete,
|
const char *adr_complete,
|
||||||
const char *fil_complete,
|
const char *fil_complete,
|
||||||
const char *referer_adr,
|
const char *referer_adr,
|
||||||
@@ -206,9 +204,9 @@ typedef t_hts_htmlcheck_savename t_hts_htmlcheck_extsavename;
|
|||||||
|
|
||||||
/* Inspect or edit the outgoing request headers in 'buff' before they are sent.
|
/* Inspect or edit the outgoing request headers in 'buff' before they are sent.
|
||||||
*/
|
*/
|
||||||
typedef int (*t_hts_htmlcheck_sendhead) (t_hts_callbackarg * carg,
|
typedef int (*t_hts_htmlcheck_sendhead)(t_hts_callbackarg *carg, httrackp *opt,
|
||||||
httrackp * opt, char *buff,
|
char *buff, const char *adr,
|
||||||
const char *adr, const char *fil,
|
const char *fil,
|
||||||
const char *referer_adr,
|
const char *referer_adr,
|
||||||
const char *referer_fil,
|
const char *referer_fil,
|
||||||
htsblk *outgoing);
|
htsblk *outgoing);
|
||||||
|
|||||||
359
src/htsdns_selftest.c
Normal file
359
src/htsdns_selftest.c
Normal file
@@ -0,0 +1,359 @@
|
|||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/*
|
||||||
|
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||||
|
Copyright (C) 2026 Xavier Roche and other contributors
|
||||||
|
|
||||||
|
SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||||
|
addresses or to collect any other private information about people. Doing so
|
||||||
|
would dishonor our work and waste the many hours we have spent on it.
|
||||||
|
|
||||||
|
Please visit our Website: http://www.httrack.com
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/* File: htsdns_selftest.c subroutines: */
|
||||||
|
/* in-process self-test for the DNS resolver and cache */
|
||||||
|
/* Author: Xavier Roche */
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
|
||||||
|
/* Routes the resolver through a scripted getaddrinfo (hts_resolver_backend)
|
||||||
|
instead of the network, so resolution and the DNS cache are testable for a
|
||||||
|
fixed set of scenarios (IPv4/IPv6/dual-stack, errors, family filter,
|
||||||
|
cache reuse) with no live DNS. */
|
||||||
|
|
||||||
|
#define HTS_INTERNAL_BYTECODE
|
||||||
|
|
||||||
|
#include "htsdns_selftest.h"
|
||||||
|
|
||||||
|
#include "htscore.h"
|
||||||
|
#include "htslib.h"
|
||||||
|
#include "htsnet.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#if HTS_INET6 != 0
|
||||||
|
|
||||||
|
/* IPV6_resolver: 0 = v4+v6, 1 = v4 only, 2 = v6 only (htscoremain -@i). */
|
||||||
|
extern int IPV6_resolver;
|
||||||
|
|
||||||
|
/* One scripted host: either a getaddrinfo error, or an ordered address list. */
|
||||||
|
/* A single scripted address: the family tag plus the raw network-order
   bytes (only the first 4 are meaningful for AF_INET, all 16 for AF_INET6). */
struct mock_addr {
  int family;             /* AF_INET / AF_INET6 */
  unsigned char addr[16]; /* 4 (v4) or 16 (v6) meaningful bytes */
};
typedef struct mock_addr mock_addr;

/* A scripted host entry: either a canned getaddrinfo error code, or an
   ordered list of up to 6 addresses. Field order matters to any positional
   initializer of this type. */
struct mock_host {
  const char *name;  /* host name matched by the mock resolver */
  int gai_err;       /* non-zero: getaddrinfo returns this */
  int naddr;         /* number of valid entries in addr[] */
  mock_addr addr[6]; /* answers, in the order they are returned */
  int calls;         /* times the backend resolved this host */
};
typedef struct mock_host mock_host;
|
||||||
|
|
||||||
|
static mock_host mock_hosts[] = {
|
||||||
|
{"v4only.test", 0, 1, {{AF_INET, {1, 2, 3, 4}}}, 0},
|
||||||
|
{"v6only.test", 0, 1, {{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 1}}}, 0},
|
||||||
|
/* dual stack, IPv6 first (RFC 6724 order) then IPv4 */
|
||||||
|
{"dual.test",
|
||||||
|
0,
|
||||||
|
2,
|
||||||
|
{{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 2}}, {AF_INET, {5, 6, 7, 8}}},
|
||||||
|
0},
|
||||||
|
/* dual stack, IPv4 first: distinguishes "keep the first address" from
|
||||||
|
"prefer a family", so the selection contract is actually pinned. */
|
||||||
|
{"dual4.test",
|
||||||
|
0,
|
||||||
|
2,
|
||||||
|
{{AF_INET, {9, 10, 11, 12}},
|
||||||
|
{AF_INET6, {0x20, 0x01, 0x0d, 0xb8, [15] = 3}}},
|
||||||
|
0},
|
||||||
|
/* more addresses than HTS_MAXADDRNUM: the list must clamp to the cap. */
|
||||||
|
{"many.test",
|
||||||
|
0,
|
||||||
|
6,
|
||||||
|
{{AF_INET, {10, 0, 0, 1}},
|
||||||
|
{AF_INET, {10, 0, 0, 2}},
|
||||||
|
{AF_INET, {10, 0, 0, 3}},
|
||||||
|
{AF_INET, {10, 0, 0, 4}},
|
||||||
|
{AF_INET, {10, 0, 0, 5}},
|
||||||
|
{AF_INET, {10, 0, 0, 6}}},
|
||||||
|
0},
|
||||||
|
{"nodns.test", EAI_NONAME, 0, {{0}}, 0},
|
||||||
|
};
|
||||||
|
|
||||||
|
static mock_host *mock_find(const char *name) {
|
||||||
|
for (size_t i = 0; i < sizeof(mock_hosts) / sizeof(mock_hosts[0]); i++) {
|
||||||
|
if (strcmp(mock_hosts[i].name, name) == 0)
|
||||||
|
return &mock_hosts[i];
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mock_reset_calls(void) {
|
||||||
|
for (size_t i = 0; i < sizeof(mock_hosts) / sizeof(mock_hosts[0]); i++)
|
||||||
|
mock_hosts[i].calls = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Build one addrinfo node owning its sockaddr (freed by mock_freeaddrinfo). */
|
||||||
|
/* Allocate one addrinfo node for scripted address 'a'. The node owns a
   freshly allocated sockaddr_in (AF_INET) or sockaddr_in6 (any other family)
   holding the address bytes; both are released by mock_freeaddrinfo().
   Returns NULL if either allocation fails (nothing is leaked). */
static struct addrinfo *mock_mkai(const mock_addr *a) {
  struct addrinfo *ai = calloct(1, sizeof(*ai));

  /* Out of memory: report it rather than dereferencing NULL below. */
  if (ai == NULL)
    return NULL;

  ai->ai_family = a->family;
  if (a->family == AF_INET) {
    struct sockaddr_in *sin = calloct(1, sizeof(*sin));

    if (sin == NULL) {
      freet(ai);
      return NULL;
    }
    sin->sin_family = AF_INET;
    memcpy(&sin->sin_addr, a->addr, 4);
    ai->ai_addr = (struct sockaddr *) sin;
    ai->ai_addrlen = sizeof(*sin);
  } else {
    struct sockaddr_in6 *sin6 = calloct(1, sizeof(*sin6));

    if (sin6 == NULL) {
      freet(ai);
      return NULL;
    }
    sin6->sin6_family = AF_INET6;
    memcpy(&sin6->sin6_addr, a->addr, 16);
    ai->ai_addr = (struct sockaddr *) sin6;
    ai->ai_addrlen = sizeof(*sin6);
  }
  return ai;
}

/* Defined further down in this file; declared here so the out-of-memory path
   of mock_getaddrinfo() can release a partially built list. */
static void mock_freeaddrinfo(struct addrinfo *res);

/* Scripted stand-in for getaddrinfo().
   node    : host name looked up in mock_hosts[]; NULL or unknown names
             yield EAI_NONAME.
   service : ignored.
   hints   : only ai_family is read; PF_UNSPEC (or NULL hints) keeps every
             scripted address, otherwise only the matching family is kept.
   res     : receives the address list in scripted order, or NULL on error.
   Returns 0 on success, the host's scripted gai_err, EAI_NONAME when the
   host is unknown or the family filter leaves nothing, or EAI_MEMORY when a
   node cannot be allocated. Every call that reaches a known host bumps its
   'calls' counter, including scripted failures. */
static int mock_getaddrinfo(const char *node, const char *service,
                            const struct addrinfo *hints,
                            struct addrinfo **res) {
  /* getaddrinfo() permits a NULL node; never hand that to strcmp(). */
  mock_host *const h = (node != NULL) ? mock_find(node) : NULL;
  const int want = (hints != NULL) ? hints->ai_family : PF_UNSPEC;
  struct addrinfo *head = NULL, *tail = NULL;

  (void) service;
  *res = NULL;
  if (h == NULL)
    return EAI_NONAME;
  h->calls++; /* a real backend hit; a cached host skips this */
  if (h->gai_err != 0)
    return h->gai_err;
  for (int i = 0; i < h->naddr; i++) {
    struct addrinfo *ai;

    if (want != PF_UNSPEC && want != h->addr[i].family)
      continue; /* honor the requested family (v4/v6 only) */
    ai = mock_mkai(&h->addr[i]);
    if (ai == NULL) {
      /* Drop what was built so far; *res stays NULL for the caller. */
      mock_freeaddrinfo(head);
      return EAI_MEMORY;
    }
    if (head == NULL)
      head = ai;
    else
      tail->ai_next = ai;
    tail = ai;
  }
  if (head == NULL)
    return EAI_NONAME; /* filtered to empty, as the libc resolver does */
  *res = head;
  return 0;
}
|
||||||
|
|
||||||
|
static void mock_freeaddrinfo(struct addrinfo *res) {
|
||||||
|
while (res != NULL) {
|
||||||
|
struct addrinfo *const next = res->ai_next;
|
||||||
|
|
||||||
|
freet(res->ai_addr);
|
||||||
|
freet(res);
|
||||||
|
res = next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Resolver backend handed to hts_dns_set_resolver_backend() by the self-test:
   it pairs the scripted lookup with its matching free routine. The
   initializer is positional, so the order must follow the field order of
   hts_resolver_backend (declared outside this file -- confirm if that struct
   ever changes). */
static const hts_resolver_backend mock_backend = {mock_getaddrinfo,
|
||||||
|
mock_freeaddrinfo};
|
||||||
|
|
||||||
|
static int failures = 0;
|
||||||
|
|
||||||
|
/* Self-test assertion: when 'cond' is false, bump the 'failures' counter in
   scope and report the file, line and expression text on stderr. Execution
   always continues, so one run reports every failing check. */
#define CHECK(cond)                                                            \
  do {                                                                         \
    if (cond)                                                                  \
      break; /* check passed: nothing to record */                            \
    failures++;                                                                \
    fprintf(stderr, "dns-selftest: FAIL at %s:%d: %s\n", __FILE__, __LINE__,   \
            #cond);                                                            \
  } while (0)
|
||||||
|
|
||||||
|
/* Resolve via the uncached entry point; return the address family, or
|
||||||
|
AF_UNSPEC if the host did not resolve. */
|
||||||
|
static int resolve_family_nocache(const char *host) {
|
||||||
|
SOCaddr addr;
|
||||||
|
const char *err = NULL;
|
||||||
|
|
||||||
|
if (hts_dns_resolve_nocache2(host, &addr, &err) == NULL)
|
||||||
|
return AF_UNSPEC;
|
||||||
|
return SOCaddr_sinfamily(addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
int dns_selftests(httrackp *opt) {
|
||||||
|
failures = 0;
|
||||||
|
hts_dns_set_resolver_backend(&mock_backend);
|
||||||
|
|
||||||
|
/* IPv4-only / IPv6-only hosts map to the right family. */
|
||||||
|
IPV6_resolver = 0;
|
||||||
|
CHECK(resolve_family_nocache("v4only.test") == AF_INET);
|
||||||
|
CHECK(resolve_family_nocache("v6only.test") == AF_INET6);
|
||||||
|
|
||||||
|
/* Dual-stack: the single-address API returns the *first* resolved address.
|
||||||
|
Both orderings pin selection by position, not a family preference. The
|
||||||
|
multi-address API (resolve_all, below) exposes the whole list. */
|
||||||
|
CHECK(resolve_family_nocache("dual.test") == AF_INET6); /* v6 listed first */
|
||||||
|
CHECK(resolve_family_nocache("dual4.test") == AF_INET); /* v4 listed first */
|
||||||
|
|
||||||
|
/* Unknown host does not resolve. */
|
||||||
|
CHECK(resolve_family_nocache("nodns.test") == AF_UNSPEC);
|
||||||
|
|
||||||
|
/* Family filter (-@i4 / -@i6) selects v4 / v6 out of the dual-stack host. */
|
||||||
|
IPV6_resolver = 1;
|
||||||
|
CHECK(resolve_family_nocache("dual.test") == AF_INET);
|
||||||
|
IPV6_resolver = 2;
|
||||||
|
CHECK(resolve_family_nocache("dual.test") == AF_INET6);
|
||||||
|
IPV6_resolver = 0;
|
||||||
|
|
||||||
|
/* Cached driver resolves a host once and reuses the *same* address. */
|
||||||
|
mock_reset_calls();
|
||||||
|
{
|
||||||
|
SOCaddr a1, a2;
|
||||||
|
char ip1[64], ip2[64];
|
||||||
|
const char *err = NULL;
|
||||||
|
|
||||||
|
CHECK(hts_dns_resolve2(opt, "v4only.test", &a1, &err) != NULL);
|
||||||
|
CHECK(hts_dns_resolve2(opt, "v4only.test", &a2, &err) != NULL);
|
||||||
|
CHECK(mock_find("v4only.test")->calls == 1);
|
||||||
|
/* the cache returns the right address, not merely a hit for the key */
|
||||||
|
SOCaddr_inetntoa(ip1, sizeof(ip1), a1);
|
||||||
|
SOCaddr_inetntoa(ip2, sizeof(ip2), a2);
|
||||||
|
CHECK(strcmp(ip1, "1.2.3.4") == 0);
|
||||||
|
CHECK(strcmp(ip1, ip2) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* A negative result is cached too: a second lookup does not re-resolve. */
|
||||||
|
{
|
||||||
|
SOCaddr a1, a2;
|
||||||
|
const char *err = NULL;
|
||||||
|
|
||||||
|
CHECK(hts_dns_resolve2(opt, "nodns.test", &a1, &err) == NULL);
|
||||||
|
CHECK(hts_dns_resolve2(opt, "nodns.test", &a2, &err) == NULL);
|
||||||
|
CHECK(mock_find("nodns.test")->calls == 1); /* resolved once, then cached */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Multi-address resolution: count and order are the connect-fallback
|
||||||
|
contract. A dead first address is retried against the next, so both must be
|
||||||
|
exact. */
|
||||||
|
mock_reset_calls();
|
||||||
|
{
|
||||||
|
SOCaddr addrs[HTS_MAXADDRNUM];
|
||||||
|
char ip[64];
|
||||||
|
const char *err = NULL;
|
||||||
|
|
||||||
|
/* dual-stack, in resolver order: [0]=v6, [1]=v4 */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "dual.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
2);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET6);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[1]) == AF_INET);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[1]);
|
||||||
|
CHECK(strcmp(ip, "5.6.7.8") == 0);
|
||||||
|
CHECK(mock_find("dual.test")->calls ==
|
||||||
|
1); /* one backend hit for the list */
|
||||||
|
|
||||||
|
/* single-address host: count 1 */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "v4only.test", addrs, HTS_MAXADDRNUM,
|
||||||
|
&err) == 1);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
|
||||||
|
CHECK(strcmp(ip, "1.2.3.4") == 0);
|
||||||
|
|
||||||
|
/* does-not-resolve: count 0 (negative), no addresses */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "nodns.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
0);
|
||||||
|
|
||||||
|
/* more than the cap: the kept list is clamped to HTS_MAXADDRNUM, keeping
|
||||||
|
the FIRST addresses in resolver order (not some other window) */
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "many.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
HTS_MAXADDRNUM);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[0]);
|
||||||
|
CHECK(strcmp(ip, "10.0.0.1") == 0);
|
||||||
|
SOCaddr_inetntoa(ip, sizeof(ip), addrs[HTS_MAXADDRNUM - 1]);
|
||||||
|
CHECK(strcmp(ip, "10.0.0.4") == 0);
|
||||||
|
|
||||||
|
/* family filter still applies through the list path */
|
||||||
|
IPV6_resolver = 1;
|
||||||
|
CHECK(hts_dns_resolve_all(opt, "dual4.test", addrs, HTS_MAXADDRNUM, &err) ==
|
||||||
|
1);
|
||||||
|
CHECK(SOCaddr_sinfamily(addrs[0]) == AF_INET);
|
||||||
|
IPV6_resolver = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* newhttp_addr() must connect to the addr_index-th address, not always the
|
||||||
|
first: this is what back_connect_next relies on to reach the fallback. */
|
||||||
|
{
|
||||||
|
htsblk r;
|
||||||
|
int count = -1;
|
||||||
|
T_SOC s;
|
||||||
|
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 0, &count);
|
||||||
|
CHECK(count == 2);
|
||||||
|
CHECK(SOCaddr_sinfamily(r.address) == AF_INET6); /* index 0 = v6 */
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
count = -1;
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 1, &count);
|
||||||
|
CHECK(count == 2);
|
||||||
|
CHECK(SOCaddr_sinfamily(r.address) == AF_INET); /* index 1 = v4 */
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
|
||||||
|
/* out-of-range index: no address selected (address stays unset) */
|
||||||
|
hts_init_htsblk(&r);
|
||||||
|
s = newhttp_addr(opt, "dual.test", &r, 80, 0, 2, NULL);
|
||||||
|
CHECK(s == INVALID_SOCKET);
|
||||||
|
if (s != INVALID_SOCKET)
|
||||||
|
deletesoc(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Connect-fallback decision (consumer of the multi-address list): when a
|
||||||
|
stuck connect should abandon the current address for the next one. */
|
||||||
|
{
|
||||||
|
/* no fallback for the last/only candidate, whatever the elapsed time */
|
||||||
|
CHECK(back_connect_fallback_due(0, 1, 9999, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(1, 2, 9999, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(3, 4, 9999, 120) == 0);
|
||||||
|
/* fallback available: wait the per-candidate deadline (cap 10s here) */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 9, 120) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 10, 120) == 1);
|
||||||
|
CHECK(back_connect_fallback_due(2, 4, 10, 120) == 1);
|
||||||
|
/* a shorter slot timeout shortens the deadline (min(timeout, cap)) */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 4, 5) == 0);
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 5, 5) == 1);
|
||||||
|
/* no timeout management: never force a fallback */
|
||||||
|
CHECK(back_connect_fallback_due(0, 2, 9999, 0) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
hts_dns_set_resolver_backend(NULL);
|
||||||
|
return failures;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/* Fallback used when the IPv6 resolver code is not compiled in: the resolver
   seam the DNS self-test drives only exists in the IPv6 build, so there is
   nothing to exercise here. Always reports zero failed checks. */
int dns_selftests(httrackp *opt) {
  const int failures = 0; /* no checks are run in this configuration */

  (void) opt; /* unused */
  return failures;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
51
src/htsdns_selftest.h
Normal file
51
src/htsdns_selftest.h
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/*
|
||||||
|
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||||
|
Copyright (C) 2026 Xavier Roche and other contributors
|
||||||
|
|
||||||
|
SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Ethical use: we kindly ask that you NOT use this software to harvest email
|
||||||
|
addresses or to collect any other private information about people. Doing so
|
||||||
|
would dishonor our work and waste the many hours we have spent on it.
|
||||||
|
|
||||||
|
Please visit our Website: http://www.httrack.com
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
/* File: htsdns_selftest.h */
|
||||||
|
/* Author: Xavier Roche */
|
||||||
|
/* ------------------------------------------------------------ */
|
||||||
|
|
||||||
|
#ifndef HTSDNS_SELFTEST_DEFH
|
||||||
|
#define HTSDNS_SELFTEST_DEFH
|
||||||
|
|
||||||
|
#ifdef HTS_INTERNAL_BYTECODE
|
||||||
|
|
||||||
|
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||||
|
#define HTS_DEF_FWSTRUCT_httrackp
|
||||||
|
typedef struct httrackp httrackp;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Drive the DNS resolver and cache through a scripted (mock) getaddrinfo,
|
||||||
|
asserting address family, single-address selection, negative caching, the
|
||||||
|
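IPv4/IPv6 family filter, that a cached host is resolved only once, the
multi-address list (count, order, HTS_MAXADDRNUM cap), newhttp_addr() address
index selection, and the connect-fallback decision.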
|
||||||
|
Returns the number of failed checks (0 == success). */
|
||||||
|
int dns_selftests(httrackp *opt);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -33,14 +33,14 @@ EOF
|
|||||||
else
|
else
|
||||||
GET "${url}"
|
GET "${url}"
|
||||||
fi
|
fi
|
||||||
) \
|
) |
|
||||||
| grep -E '^<!ENTITY [a-zA-Z0-9_]' \
|
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
|
||||||
| sed \
|
sed \
|
||||||
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
||||||
-e 's/-->$//' \
|
-e 's/-->$//' \
|
||||||
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'\
|
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
|
||||||
| ( \
|
(
|
||||||
read A
|
read -r A
|
||||||
while test -n "$A"; do
|
while test -n "$A"; do
|
||||||
ent="${A%% *}"
|
ent="${A%% *}"
|
||||||
code=$(echo "$A" | cut -f2 -d' ')
|
code=$(echo "$A" | cut -f2 -d' ')
|
||||||
@@ -49,11 +49,11 @@ EOF
|
|||||||
i=0
|
i=0
|
||||||
a=1664525
|
a=1664525
|
||||||
c=1013904223
|
c=1013904223
|
||||||
m="$[1 << 32]"
|
m="$((1 << 32))"
|
||||||
while test "$i" -lt ${#ent}; do
|
while test "$i" -lt ${#ent}; do
|
||||||
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
||||||
hash="$[((${hash}*${a})%(${m})+${d}+${c})%(${m})]"
|
hash="$((((hash * a) % (m) + d + c) % (m)))"
|
||||||
i=$[${i}+1]
|
i=$((i + 1))
|
||||||
done
|
done
|
||||||
echo -e " /* $A */"
|
echo -e " /* $A */"
|
||||||
echo -e " case ${hash}u:"
|
echo -e " case ${hash}u:"
|
||||||
@@ -63,7 +63,7 @@ EOF
|
|||||||
echo -e " break;"
|
echo -e " break;"
|
||||||
|
|
||||||
# next
|
# next
|
||||||
read A
|
read -r A
|
||||||
done
|
done
|
||||||
)
|
)
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
|
|||||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
|||||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||||
LIB_VERSION the data/cache format generation. */
|
LIB_VERSION the data/cache format generation. */
|
||||||
#define HTTRACK_VERSION "3.49-8"
|
#define HTTRACK_VERSION "3.49-9"
|
||||||
#define HTTRACK_VERSIONID "3.49.8"
|
#define HTTRACK_VERSIONID "3.49.9"
|
||||||
#define HTTRACK_AFF_VERSION "3.x"
|
#define HTTRACK_AFF_VERSION "3.x"
|
||||||
#define HTTRACK_LIB_VERSION "2.0"
|
#define HTTRACK_LIB_VERSION "2.0"
|
||||||
|
|
||||||
@@ -226,9 +226,14 @@ Please visit our Website: http://www.httrack.com
|
|||||||
|
|
||||||
/* Copyright (C) 1998 Xavier Roche and other contributors */
|
/* Copyright (C) 1998 Xavier Roche and other contributors */
|
||||||
#define HTTRACK_AFF_AUTHORS "[XR&CO'2014]"
|
#define HTTRACK_AFF_AUTHORS "[XR&CO'2014]"
|
||||||
#define HTS_DEFAULT_FOOTER "<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION " " HTTRACK_AFF_AUTHORS ", %s -->"
|
#define HTS_DEFAULT_FOOTER \
|
||||||
|
"<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION \
|
||||||
|
" " HTTRACK_AFF_AUTHORS ", %s -->"
|
||||||
#define HTTRACK_WEB "http://www.httrack.com"
|
#define HTTRACK_WEB "http://www.httrack.com"
|
||||||
#define HTS_UPDATE_WEBSITE "http://www.httrack.com/update.php3?Product=HTTrack&Version=" HTTRACK_VERSIONID "&VersionStr=" HTTRACK_VERSION "&Platform=%d&Language=%s"
|
#define HTS_UPDATE_WEBSITE \
|
||||||
|
"http://www.httrack.com/" \
|
||||||
|
"update.php3?Product=HTTrack&Version=" HTTRACK_VERSIONID \
|
||||||
|
"&VersionStr=" HTTRACK_VERSION "&Platform=%d&Language=%s"
|
||||||
|
|
||||||
#define H_CRLF "\x0d\x0a"
|
#define H_CRLF "\x0d\x0a"
|
||||||
#define CRLF "\x0d\x0a"
|
#define CRLF "\x0d\x0a"
|
||||||
@@ -242,6 +247,25 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#define HTS_NOPARAM "(none)"
|
#define HTS_NOPARAM "(none)"
|
||||||
#define HTS_NOPARAM2 "\"(none)\""
|
#define HTS_NOPARAM2 "\"(none)\""
|
||||||
|
|
||||||
|
/* Boolean flag for option fields and API yes/no returns. Int-backed, not an
|
||||||
|
enum: an enum makes C++ reject `field = 1` / `f(0)` on the exported fields
|
||||||
|
and params. Int-sized, so the httrackp layout and the ABI are unchanged. */
|
||||||
|
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
||||||
|
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
||||||
|
|
||||||
|
typedef int hts_boolean;
|
||||||
|
#define HTS_FALSE 0
|
||||||
|
#define HTS_TRUE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef HTS_DEF_DEFSTRUCT_hts_tristate
|
||||||
|
#define HTS_DEF_DEFSTRUCT_hts_tristate
|
||||||
|
/* Tri-state hts_boolean: HTS_DEFAULT (-1) = "unspecified" (copy_htsopt leaves
|
||||||
|
the target untouched); HTS_FALSE/HTS_TRUE = off/on. */
|
||||||
|
typedef int hts_tristate;
|
||||||
|
#define HTS_DEFAULT (-1)
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||||
#define maximum(A, B) ((A) > (B) ? (A) : (B))
|
#define maximum(A, B) ((A) > (B) ? (A) : (B))
|
||||||
|
|
||||||
@@ -270,8 +294,8 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
/* See <http://gcc.gnu.org/wiki/Visibility> */
|
/* See <http://gcc.gnu.org/wiki/Visibility> */
|
||||||
#if ( ( defined(__GNUC__) && ( __GNUC__ >= 4 ) ) \
|
#if ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
|
||||||
|| ( defined(HAVE_VISIBILITY) && HAVE_VISIBILITY ) )
|
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
|
||||||
|
|
||||||
#define HTSEXT_API __attribute__((visibility("default")))
|
#define HTSEXT_API __attribute__((visibility("default")))
|
||||||
#else
|
#else
|
||||||
@@ -327,8 +351,8 @@ typedef __int64 LLint;
|
|||||||
typedef __int64 TStamp;
|
typedef __int64 TStamp;
|
||||||
|
|
||||||
#define LLintP "%I64d"
|
#define LLintP "%I64d"
|
||||||
#elif (defined(_LP64) || defined(__x86_64__) \
|
#elif (defined(_LP64) || defined(__x86_64__) || defined(__powerpc64__) || \
|
||||||
|| defined(__powerpc64__) || defined(__64BIT__))
|
defined(__64BIT__))
|
||||||
|
|
||||||
typedef long int LLint;
|
typedef long int LLint;
|
||||||
|
|
||||||
@@ -384,6 +408,10 @@ typedef int T_SOC;
|
|||||||
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
||||||
#define HTS_MAXADDRLEN 64
|
#define HTS_MAXADDRLEN 64
|
||||||
|
|
||||||
|
/* Max resolved addresses kept per host for connect fallback (dead IPv6 etc.).
|
||||||
|
*/
|
||||||
|
#define HTS_MAXADDRNUM 4
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#else
|
#else
|
||||||
#define __cdecl
|
#define __cdecl
|
||||||
@@ -397,7 +425,8 @@ typedef int T_SOC;
|
|||||||
#if HTS_ACCESS
|
#if HTS_ACCESS
|
||||||
#define HTS_ACCESS_FILE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
|
#define HTS_ACCESS_FILE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
|
||||||
|
|
||||||
#define HTS_ACCESS_FOLDER (S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
|
#define HTS_ACCESS_FOLDER \
|
||||||
|
(S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)
|
||||||
#else
|
#else
|
||||||
#define HTS_ACCESS_FILE (S_IRUSR | S_IWUSR)
|
#define HTS_ACCESS_FILE (S_IRUSR | S_IWUSR)
|
||||||
|
|
||||||
@@ -419,7 +448,11 @@ typedef int T_SOC;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* fflush sur stdout */
|
/* fflush sur stdout */
|
||||||
#define io_flush { fflush(stdout); fflush(stdin); }
|
#define io_flush \
|
||||||
|
{ \
|
||||||
|
fflush(stdout); \
|
||||||
|
fflush(stdin); \
|
||||||
|
}
|
||||||
|
|
||||||
/* HTSLib */
|
/* HTSLib */
|
||||||
|
|
||||||
@@ -516,7 +549,13 @@ static const t_htsboundary htsboundary = 0xDEADBEEF;
|
|||||||
#if _HTS_WIDE
|
#if _HTS_WIDE
|
||||||
extern FILE *DEBUG_fp;
|
extern FILE *DEBUG_fp;
|
||||||
|
|
||||||
#define DEBUG_W(A) { if (DEBUG_fp==NULL) DEBUG_fp=fopen("bug.out","wb"); fprintf(DEBUG_fp,":>"A); fflush(DEBUG_fp); }
|
#define DEBUG_W(A) \
|
||||||
|
{ \
|
||||||
|
if (DEBUG_fp == NULL) \
|
||||||
|
DEBUG_fp = fopen("bug.out", "wb"); \
|
||||||
|
fprintf(DEBUG_fp, ":>" A); \
|
||||||
|
fflush(DEBUG_fp); \
|
||||||
|
}
|
||||||
#undef _
|
#undef _
|
||||||
#define _ ,
|
#define _ ,
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
755
src/htslib.c
755
src/htslib.c
File diff suppressed because it is too large
Load Diff
63
src/htslib.h
63
src/htslib.h
@@ -147,11 +147,13 @@ struct OLD_htsblk {
|
|||||||
#define HTS_DEF_FWSTRUCT_t_dnscache
|
#define HTS_DEF_FWSTRUCT_t_dnscache
|
||||||
typedef struct t_dnscache t_dnscache;
|
typedef struct t_dnscache t_dnscache;
|
||||||
#endif
|
#endif
|
||||||
|
// One DNS cache record, stored as a coucal value keyed by hostname.
|
||||||
struct t_dnscache {
|
struct t_dnscache {
|
||||||
struct t_dnscache *next;
|
// resolved addresses, in resolver (RFC 6724) order; host_count==0 means the
|
||||||
const char *iadr;
|
// name does not resolve (negative cache). host_count<=HTS_MAXADDRNUM.
|
||||||
size_t host_length; // length ; (4 or 16) ; 0 for error
|
int host_count;
|
||||||
char host_addr[HTS_MAXADDRLEN];
|
size_t host_length[HTS_MAXADDRNUM]; // sockaddr length of each (16 or 28)
|
||||||
|
char host_addr[HTS_MAXADDRNUM][HTS_MAXADDRLEN];
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Library internal definitions */
|
/* Library internal definitions */
|
||||||
@@ -182,10 +184,22 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode, const char *xsend
|
|||||||
const char *adr, const char *fil,
|
const char *adr, const char *fil,
|
||||||
const char *referer_adr, const char *referer_fil,
|
const char *referer_adr, const char *referer_fil,
|
||||||
htsblk * retour);
|
htsblk * retour);
|
||||||
|
/* Build the request "Cookie:" header line for stored cookies matching
|
||||||
|
domain/path into dst (NUL-terminated). Exposed for the -#Q self-test;
|
||||||
|
wraps the same logic http_sendhead() uses. Returns cookies emitted. */
|
||||||
|
int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||||
|
const char *path, char *dst, size_t dst_size);
|
||||||
|
|
||||||
//int newhttp(char* iadr,char* err=NULL);
|
//int newhttp(char* iadr,char* err=NULL);
|
||||||
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
||||||
int waitconnect);
|
int waitconnect);
|
||||||
|
/* Like newhttp(), but connect to the addr_index-th resolved address of the host
|
||||||
|
(0-based) instead of always the first; *addr_count, if non-NULL, is set to
|
||||||
|
the total resolved addresses. newhttp() == newhttp_addr(...,0,NULL). Used by
|
||||||
|
the slot scheduler to try the next address when a connect fails (dead IPv6
|
||||||
|
etc.). */
|
||||||
|
T_SOC newhttp_addr(httrackp *opt, const char *iadr, htsblk *retour, int port,
|
||||||
|
int waitconnect, int addr_index, int *addr_count);
|
||||||
HTS_INLINE void deletehttp(htsblk * r);
|
HTS_INLINE void deletehttp(htsblk * r);
|
||||||
HTS_INLINE int deleteaddr(htsblk * r);
|
HTS_INLINE int deleteaddr(htsblk * r);
|
||||||
HTS_INLINE void deletesoc(T_SOC soc);
|
HTS_INLINE void deletesoc(T_SOC soc);
|
||||||
@@ -193,15 +207,31 @@ HTS_INLINE void deletesoc_r(htsblk * r);
|
|||||||
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
||||||
int check_readinput(htsblk * r);
|
int check_readinput(htsblk * r);
|
||||||
int check_readinput_t(T_SOC soc, int timeout);
|
int check_readinput_t(T_SOC soc, int timeout);
|
||||||
|
int check_writeinput_t(T_SOC soc, int timeout);
|
||||||
|
|
||||||
|
/* Open an HTTP CONNECT tunnel through the active proxy for an https request:
|
||||||
|
`retour->soc` must already be TCP-connected to the proxy, and `adr` is the
|
||||||
|
origin authority (url_adr, e.g. "https://host:port"). Sends the CONNECT
|
||||||
|
request (with Proxy-Authorization when the proxy carries credentials) and
|
||||||
|
reads the proxy's status line, so the caller's TLS handshake then runs
|
||||||
|
end-to-end with the origin. Blocks up to `timeout` seconds. Returns 1 on a
|
||||||
|
2xx tunnel, 0 on failure (retour->msg/statuscode set). */
|
||||||
|
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||||
|
int timeout);
|
||||||
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
||||||
char *rcvd);
|
char *rcvd);
|
||||||
void treatfirstline(htsblk * retour, const char *rcvd);
|
void treatfirstline(htsblk * retour, const char *rcvd);
|
||||||
|
|
||||||
// sous-fonctions
|
// sous-fonctions
|
||||||
LLint http_xfread1(htsblk * r, int bufl);
|
LLint http_xfread1(htsblk * r, int bufl);
|
||||||
HTS_INLINE SOCaddr* hts_dns_resolve2(httrackp * opt, const char *iadr,
|
/* Cached resolver: fill out[0..count-1] with up to max addresses for iadr (in
|
||||||
SOCaddr *const addr,
|
resolver order), returning the count (0 = does not resolve, negative-cached).
|
||||||
|
Resolves once per host; later calls read the DNS cache. Must hold no lock
|
||||||
|
(brackets opt->state.lock itself). */
|
||||||
|
int hts_dns_resolve_all(httrackp *opt, const char *iadr, SOCaddr *out, int max,
|
||||||
const char **error);
|
const char **error);
|
||||||
|
HTS_INLINE SOCaddr *hts_dns_resolve2(httrackp *opt, const char *iadr,
|
||||||
|
SOCaddr *const addr, const char **error);
|
||||||
HTS_INLINE SOCaddr* hts_dns_resolve(httrackp * opt, const char *iadr,
|
HTS_INLINE SOCaddr* hts_dns_resolve(httrackp * opt, const char *iadr,
|
||||||
SOCaddr *const addr);
|
SOCaddr *const addr);
|
||||||
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
HTSEXT_API SOCaddr* hts_dns_resolve_nocache2(const char *const hostname,
|
||||||
@@ -214,8 +244,9 @@ HTSEXT_API int check_hostname_dns(const char *const hostname);
|
|||||||
int ftp_available(void);
|
int ftp_available(void);
|
||||||
|
|
||||||
#if HTS_DNSCACHE
|
#if HTS_DNSCACHE
|
||||||
void hts_cache_free(t_dnscache *const cache);
|
/* Return opt's DNS cache hashtable (hostname -> t_dnscache record), creating it
|
||||||
t_dnscache *hts_cache(httrackp * opt);
|
on first use. Records are owned by the table and freed on coucal_delete. */
|
||||||
|
coucal hts_cache(httrackp *opt);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// outils divers
|
// outils divers
|
||||||
@@ -465,10 +496,22 @@ HTS_STATIC int strcmpnocase(const char *a, const char *b) {
|
|||||||
|
|
||||||
// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type?
|
// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type?
|
||||||
#define HTS_HYPERTEXT_DEFAULT_MIME "text/html"
|
#define HTS_HYPERTEXT_DEFAULT_MIME "text/html"
|
||||||
|
/* Sentinel stored when the server declared no Content-Type. It is html-ish
|
||||||
|
for every type test (so a typeless response still parses/stores as today),
|
||||||
|
but the naming code (wire_patches_ext) treats it as "no declared type" and
|
||||||
|
keeps the URL extension. It rides the cache, so updates name consistently. */
|
||||||
|
#define HTS_UNKNOWN_MIME "unknown/unknown"
|
||||||
|
/* Map the no-declared-type sentinel back to a real type for any header or
|
||||||
|
record we EMIT or PERSIST, so "unknown/unknown" never reaches a consumer
|
||||||
|
(a served Content-Type, a ProxyTrack .arc record, ...). Macro: the argument
may be evaluated more than once, so pass an expression without side effects. */
|
||||||
|
#define hts_effective_mime(m) \
|
||||||
|
(strfield2((m), HTS_UNKNOWN_MIME) ? HTS_HYPERTEXT_DEFAULT_MIME : (m))
|
||||||
|
|
||||||
#define is_html_mime_type(a) \
|
#define is_html_mime_type(a) \
|
||||||
( (strfield2((a),"text/html")!=0)\
|
((strfield2((a), "text/html") != 0) || \
|
||||||
|| (strfield2((a),"application/xhtml+xml")!=0) \
|
(strfield2((a), "application/xhtml+xml") != 0) || \
|
||||||
|
(strfield2((a), HTS_UNKNOWN_MIME) != \
|
||||||
|
0) /* no declared type: treat as html */ \
|
||||||
)
|
)
|
||||||
#define is_hypertext_mime__(a) \
|
#define is_hypertext_mime__(a) \
|
||||||
( \
|
( \
|
||||||
|
|||||||
@@ -92,8 +92,8 @@ struct htsmoduleStruct {
|
|||||||
|
|
||||||
/* Callbacks */
|
/* Callbacks */
|
||||||
t_htsAddLink addLink; /* call this function when links are
|
t_htsAddLink addLink; /* call this function when links are
|
||||||
being detected. it is not your responsibility to decide
|
being detected. it is not your responsibility to
|
||||||
if the engine will keep them, or not. */
|
decide if the engine will keep them, or not. */
|
||||||
|
|
||||||
/* Optional */
|
/* Optional */
|
||||||
char *localLink; /* if non null, the engine will write there the local
|
char *localLink; /* if non null, the engine will write there the local
|
||||||
@@ -117,7 +117,6 @@ struct htsmoduleStruct {
|
|||||||
int *ptr_;
|
int *ptr_;
|
||||||
const char *page_charset_;
|
const char *page_charset_;
|
||||||
/* Internal use - please don't touch */
|
/* Internal use - please don't touch */
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
119
src/htsname.c
119
src/htsname.c
@@ -138,6 +138,35 @@ static void cleanEndingSpaceOrDot(char *s) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||||
|
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||||
|
extension implies no specific type or the server declared a disagreeing one.
|
||||||
|
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||||
|
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
|
||||||
|
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
|
||||||
|
is named .html. The sentinel rides the cache, so updates stay consistent. */
|
||||||
|
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||||
|
const char *file) {
|
||||||
|
char urlmime[256];
|
||||||
|
|
||||||
|
if (may_unknown2(opt, wiremime, file))
|
||||||
|
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||||
|
urlmime[0] = '\0';
|
||||||
|
/* type implied by the URL extension, only when confidently known (flag 0) */
|
||||||
|
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
|
||||||
|
return 1; /* URL ext implies no known type: trust the wire type */
|
||||||
|
if (strfield2(wiremime, urlmime))
|
||||||
|
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||||
|
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
|
||||||
|
the server declared no type (the sentinel); an explicitly declared type,
|
||||||
|
even text/html, is trusted, so a binary-looking URL that really serves
|
||||||
|
HTML (login/error interstitial, soft-404) is named .html. */
|
||||||
|
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||||
|
strfield2(wiremime, HTS_UNKNOWN_MIME))
|
||||||
|
return 0;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
||||||
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
||||||
int url_savename(lien_adrfilsave *const afs,
|
int url_savename(lien_adrfilsave *const afs,
|
||||||
@@ -184,10 +213,11 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
|
|
||||||
/* 8-3 ? */
|
/* 8-3 ? */
|
||||||
switch (opt->savename_83) {
|
switch (opt->savename_83) {
|
||||||
case 1: // 8-3
|
case HTS_SAVENAME_83_DOS: // 8-3
|
||||||
max_char = 8;
|
max_char = 8;
|
||||||
break;
|
break;
|
||||||
case 2: // Level 2 File names may be up to 31 characters.
|
case HTS_SAVENAME_83_ISO9660: // Level 2 File names may be up to 31
|
||||||
|
// characters.
|
||||||
max_char = 31;
|
max_char = 31;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@@ -324,7 +354,10 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* replace shtml to html.. */
|
/* replace shtml to html.. */
|
||||||
if (opt->savename_delayed == 2)
|
/* HARD delays every type, except one the user pinned with --assume: honor it
|
||||||
|
immediately (ishtml() consults the user type), no delayed name (#56) */
|
||||||
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||||
|
!is_userknowntype(opt, fil))
|
||||||
is_html = -1; /* ALWAYS delay type */
|
is_html = -1; /* ALWAYS delay type */
|
||||||
else
|
else
|
||||||
is_html = ishtml(opt, fil);
|
is_html = ishtml(opt, fil);
|
||||||
@@ -363,7 +396,9 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
) {
|
) {
|
||||||
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
||||||
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
||||||
if (opt->savename_delayed == 2 || (ishtest = ishtml(opt, fil)) < 0) { // on ne sait pas si c'est un html ou un fichier..
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||||
|
(ishtest = ishtml(opt, fil)) <
|
||||||
|
0) { // unsure whether it's html or a file
|
||||||
// lire dans le cache
|
// lire dans le cache
|
||||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||||
|
|
||||||
@@ -377,7 +412,7 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
if (strnotempty(r.cdispo)) { /* filename given */
|
if (strnotempty(r.cdispo)) { /* filename given */
|
||||||
ext_chg = 2; /* change filename */
|
ext_chg = 2; /* change filename */
|
||||||
strcpybuff(ext, r.cdispo);
|
strcpybuff(ext, r.cdispo);
|
||||||
} else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
|
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||||
if (give_mimext(s, sizeof(s),
|
if (give_mimext(s, sizeof(s),
|
||||||
r.contenttype)) { // recognized extension
|
r.contenttype)) { // recognized extension
|
||||||
ext_chg = 1;
|
ext_chg = 1;
|
||||||
@@ -393,11 +428,12 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
//
|
//
|
||||||
} else if (opt->savename_delayed != 2 && is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||||
|
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||||
Lookup mimetype not only by extension,
|
Lookup mimetype not only by extension,
|
||||||
but also by filename */
|
but also by filename */
|
||||||
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type,
|
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the
|
||||||
that is, ".html" */
|
text/html MIME file type, that is, ".html" */
|
||||||
char BIGSTK mime[1024];
|
char BIGSTK mime[1024];
|
||||||
|
|
||||||
mime[0] = ext[0] = '\0';
|
mime[0] = ext[0] = '\0';
|
||||||
@@ -408,16 +444,21 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// note: if savename_delayed is enabled, the naming will be temporary (and slightly invalid!)
|
// note: if savename_delayed is enabled, the naming will be temporary
|
||||||
// note: if we are about to stop (opt->state.stop), back_add() will fail later
|
// (and slightly invalid!)
|
||||||
else if (opt->savename_delayed != 0 && !opt->state.stop) {
|
//
|
||||||
|
// note: if we are about to stop (opt->state.stop), back_add() will
|
||||||
|
// fail later
|
||||||
|
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||||
|
!opt->state.stop) {
|
||||||
// Check if the file is ready in backing. We basically take the same logic as later.
|
// Check if the file is ready in backing. We basically take the same logic as later.
|
||||||
// FIXME: we should cleanup and factorize this unholy mess
|
// FIXME: we should cleanup and factorize this unholy mess
|
||||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||||
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
||||||
ext_chg = 2; /* change filename */
|
ext_chg = 2; /* change filename */
|
||||||
strcpybuff(ext, headers->r.cdispo);
|
strcpybuff(ext, headers->r.cdispo);
|
||||||
} else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||||
|
headers->url_fil)) {
|
||||||
char s[16];
|
char s[16];
|
||||||
if (give_mimext(
|
if (give_mimext(
|
||||||
s, sizeof(s),
|
s, sizeof(s),
|
||||||
@@ -633,7 +674,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
if (!has_been_moved) {
|
if (!has_been_moved) {
|
||||||
if (back[b].r.statuscode != -10) { // erreur
|
if (back[b].r.statuscode != -10) { // erreur
|
||||||
if (strnotempty(back[b].r.contenttype) == 0)
|
if (strnotempty(back[b].r.contenttype) == 0)
|
||||||
strcpybuff(back[b].r.contenttype, "text/html"); // message d'erreur en html
|
strcpybuff(back[b].r.contenttype,
|
||||||
|
HTS_UNKNOWN_MIME); // no declared type
|
||||||
// Finalement on, renvoie un erreur, pour ne toucher à rien dans le code
|
// Finalement on, renvoie un erreur, pour ne toucher à rien dans le code
|
||||||
// libérer emplacement backing
|
// libérer emplacement backing
|
||||||
}
|
}
|
||||||
@@ -645,7 +687,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
||||||
ext_chg = 2; /* change filename */
|
ext_chg = 2; /* change filename */
|
||||||
strcpybuff(ext, back[b].r.cdispo);
|
strcpybuff(ext, back[b].r.cdispo);
|
||||||
} else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||||
|
back[b].url_fil)) {
|
||||||
if (give_mimext(
|
if (give_mimext(
|
||||||
s, sizeof(s),
|
s, sizeof(s),
|
||||||
back[b].r.contenttype)) { // recognized extension
|
back[b].r.contenttype)) { // recognized extension
|
||||||
@@ -717,9 +760,9 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
|
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Changer extension?
|
// Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm
|
||||||
// par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
|
// depending on the resolved type.
|
||||||
if (ext_chg && !opt->no_type_change) { // changer ext
|
if (ext_chg && !opt->no_type_change) {
|
||||||
char *a = fil + strlen(fil) - 1;
|
char *a = fil + strlen(fil) - 1;
|
||||||
|
|
||||||
if ((opt->debug > 1) && (opt->log != NULL)) {
|
if ((opt->debug > 1) && (opt->log != NULL)) {
|
||||||
@@ -731,11 +774,19 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
adr_complete, fil_complete, ext);
|
adr_complete, fil_complete, ext);
|
||||||
}
|
}
|
||||||
if (ext_chg == 1) {
|
if (ext_chg == 1) {
|
||||||
|
// Cut the old extension only when it is empty (a bare trailing dot), the
|
||||||
|
// new one, or a recognized one; an unknown trailing ".token" (e.g.
|
||||||
|
// /article-1.884291, #115) is part of the name, not an extension.
|
||||||
|
const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil);
|
||||||
|
const int known_ext = !*old_ext || strfield2(old_ext, ext) ||
|
||||||
|
is_knowntype(opt, fil) || is_dyntype(old_ext) ||
|
||||||
|
ishtml_ext(old_ext) != -1;
|
||||||
|
|
||||||
while((a > fil) && (*a != '.') && (*a != '/'))
|
while((a > fil) && (*a != '.') && (*a != '/'))
|
||||||
a--;
|
a--;
|
||||||
if (*a == '.')
|
if (*a == '.' && known_ext)
|
||||||
*a = '\0'; // couper
|
*a = '\0'; // cut
|
||||||
strcatbuff(fil, "."); // recopier point
|
strcatbuff(fil, "."); // re-add the dot
|
||||||
} else {
|
} else {
|
||||||
while((a > fil) && (*a != '/'))
|
while((a > fil) && (*a != '/'))
|
||||||
a--;
|
a--;
|
||||||
@@ -743,7 +794,7 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
a++;
|
a++;
|
||||||
*a = '\0';
|
*a = '\0';
|
||||||
}
|
}
|
||||||
strcatbuff(fil, ext); // copier ext/nom
|
strcatbuff(fil, ext); // append ext/name
|
||||||
}
|
}
|
||||||
// Rechercher premier / et dernier .
|
// Rechercher premier / et dernier .
|
||||||
{
|
{
|
||||||
@@ -1190,7 +1241,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
// Not used anymore unless non-delayed types.
|
// Not used anymore unless non-delayed types.
|
||||||
// de même en cas de manque d'extension on en place une de manière forcée..
|
// de même en cas de manque d'extension on en place une de manière forcée..
|
||||||
// cela évite les /chez/toto et les /chez/toto/index.html incompatibles
|
// cela évite les /chez/toto et les /chez/toto/index.html incompatibles
|
||||||
if (opt->savename_type != -1 && opt->savename_delayed != 2) {
|
if (opt->savename_type != -1 &&
|
||||||
|
opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD) {
|
||||||
char *a = afs->save + strlen(afs->save) - 1;
|
char *a = afs->save + strlen(afs->save) - 1;
|
||||||
|
|
||||||
while((a > afs->save) && (*a != '.') && (*a != '/'))
|
while((a > afs->save) && (*a != '.') && (*a != '/'))
|
||||||
@@ -1248,18 +1300,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
|| c == '>' // windows forbidden
|
|| c == '>' // windows forbidden
|
||||||
|| c == '|' // windows forbidden
|
|| c == '|' // windows forbidden
|
||||||
//|| c == '@' // ?
|
//|| c == '@' // ?
|
||||||
||
|
|| (opt->savename_83 == HTS_SAVENAME_83_ISO9660 // CDROM
|
||||||
(
|
&& (c == '-' || c == '=' || c == '+'))) {
|
||||||
opt->savename_83 == 2 // CDROM
|
|
||||||
&&
|
|
||||||
(
|
|
||||||
c == '-'
|
|
||||||
|| c == '='
|
|
||||||
|| c == '+'
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
{
|
|
||||||
afs->save[i] = '_';
|
afs->save[i] = '_';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1521,7 +1563,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
char *a = afs->save + strlen(afs->save) - 1;
|
char *a = afs->save + strlen(afs->save) - 1;
|
||||||
char *b;
|
char *b;
|
||||||
int n = 2;
|
int n = 2;
|
||||||
char collisionSeparator = ((opt->savename_83 != 2) ? '-' : '_');
|
char collisionSeparator =
|
||||||
|
((opt->savename_83 != HTS_SAVENAME_83_ISO9660) ? '-' : '_');
|
||||||
|
|
||||||
tempo[0] = '\0';
|
tempo[0] = '\0';
|
||||||
|
|
||||||
@@ -1686,10 +1729,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
|||||||
StringBuff(opt->path_log), digest_filename);
|
StringBuff(opt->path_log), digest_filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* remove refname if any */
|
/* remove refname if any; HTS_TRUE if it was removed */
|
||||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||||
const char *fil) {
|
const char *fil) {
|
||||||
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
char *filename = url_savename_refname_fullpath(opt, adr, fil);
|
||||||
|
|
||||||
(void) UNLINK(filename);
|
return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -104,7 +104,8 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
|
|||||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||||
const char *fil);
|
const char *fil);
|
||||||
void url_savename_refname_remove(httrackp * opt, const char *adr,
|
/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
|
||||||
|
hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
|
||||||
const char *fil);
|
const char *fil);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
70
src/htsnet.h
70
src/htsnet.h
@@ -112,8 +112,8 @@ struct SOCaddr {
|
|||||||
|
|
||||||
/** Pointer to the port field (network byte order) for the active family.
|
/** Pointer to the port field (network byte order) for the active family.
|
||||||
Asserts on NULL or an unset/unknown family. */
|
Asserts on NULL or an unset/unknown family. */
|
||||||
static HTS_INLINE HTS_UNUSED in_port_t* SOCaddr_sinport_(SOCaddr *const addr,
|
static HTS_INLINE HTS_UNUSED in_port_t *
|
||||||
const char *file, const int line) {
|
SOCaddr_sinport_(SOCaddr *const addr, const char *file, const int line) {
|
||||||
assertf_(addr != NULL, file, line);
|
assertf_(addr != NULL, file, line);
|
||||||
switch (addr->m_addr.sa.sa_family) {
|
switch (addr->m_addr.sa.sa_family) {
|
||||||
case AF_INET:
|
case AF_INET:
|
||||||
@@ -134,7 +134,8 @@ static HTS_INLINE HTS_UNUSED in_port_t* SOCaddr_sinport_(SOCaddr *const addr,
|
|||||||
/** Length of the active sockaddr (sockaddr_in or sockaddr_in6), or 0 if the
|
/** Length of the active sockaddr (sockaddr_in or sockaddr_in6), or 0 if the
|
||||||
family is unset/unknown. The 0 case doubles as the "not valid" test. */
|
family is unset/unknown. The 0 case doubles as the "not valid" test. */
|
||||||
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_size_(const SOCaddr *const addr,
|
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_size_(const SOCaddr *const addr,
|
||||||
const char *file, const int line) {
|
const char *file,
|
||||||
|
const int line) {
|
||||||
assertf_(addr != NULL, file, line);
|
assertf_(addr != NULL, file, line);
|
||||||
switch (addr->m_addr.sa.sa_family) {
|
switch (addr->m_addr.sa.sa_family) {
|
||||||
case AF_INET:
|
case AF_INET:
|
||||||
@@ -152,8 +153,8 @@ static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_size_(const SOCaddr*const addr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Reset to the unset state (family AF_UNSPEC), making the address invalid. */
|
/** Reset to the unset state (family AF_UNSPEC), making the address invalid. */
|
||||||
static HTS_INLINE HTS_UNUSED void SOCaddr_clear_(SOCaddr*const addr,
|
static HTS_INLINE HTS_UNUSED void
|
||||||
const char *file, const int line) {
|
SOCaddr_clear_(SOCaddr *const addr, const char *file, const int line) {
|
||||||
assertf_(addr != NULL, file, line);
|
assertf_(addr != NULL, file, line);
|
||||||
addr->m_addr.sa.sa_family = AF_UNSPEC;
|
addr->m_addr.sa.sa_family = AF_UNSPEC;
|
||||||
}
|
}
|
||||||
@@ -191,14 +192,16 @@ static HTS_INLINE HTS_UNUSED void SOCaddr_clear_(SOCaddr*const addr,
|
|||||||
|
|
||||||
/** Set the port (host-order argument, stored network-order) on the active
|
/** Set the port (host-order argument, stored network-order) on the active
|
||||||
* family. */
|
* family. */
|
||||||
#define SOCaddr_initport(server, port) do { \
|
#define SOCaddr_initport(server, port) \
|
||||||
|
do { \
|
||||||
SOCaddr_sinport(server) = htons((in_port_t) (port)); \
|
SOCaddr_sinport(server) = htons((in_port_t) (port)); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Initialize as an all-zero IPv4 wildcard (INADDR_ANY) address; returns its
|
/** Initialize as an all-zero IPv4 wildcard (INADDR_ANY) address; returns its
|
||||||
sockaddr length. */
|
sockaddr length. */
|
||||||
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr *const addr,
|
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr *const addr,
|
||||||
const char *file, const int line) {
|
const char *file,
|
||||||
|
const int line) {
|
||||||
assertf_(addr != NULL, file, line);
|
assertf_(addr != NULL, file, line);
|
||||||
memset(&addr->m_addr.in, 0, sizeof(addr->m_addr.in));
|
memset(&addr->m_addr.in, 0, sizeof(addr->m_addr.in));
|
||||||
addr->m_addr.in.sin_family = AF_INET;
|
addr->m_addr.in.sin_family = AF_INET;
|
||||||
@@ -206,7 +209,8 @@ static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr*const addr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Initialize server as an IPv4 wildcard (INADDR_ANY) address. */
|
/** Initialize server as an IPv4 wildcard (INADDR_ANY) address. */
|
||||||
#define SOCaddr_initany(server) do { \
|
#define SOCaddr_initany(server) \
|
||||||
|
do { \
|
||||||
SOCaddr_initany_(&(server), __FILE__, __LINE__); \
|
SOCaddr_initany_(&(server), __FILE__, __LINE__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
@@ -215,8 +219,10 @@ static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr*const addr,
|
|||||||
with port zeroed. Any other size leaves an AF_INET shell. Returns the
|
with port zeroed. Any other size leaves an AF_INET shell. Returns the
|
||||||
resulting sockaddr length. */
|
resulting sockaddr length. */
|
||||||
static HTS_UNUSED socklen_t SOCaddr_copyaddr_(SOCaddr *const server,
|
static HTS_UNUSED socklen_t SOCaddr_copyaddr_(SOCaddr *const server,
|
||||||
const void *data, const size_t data_size,
|
const void *data,
|
||||||
const char *file, const int line) {
|
const size_t data_size,
|
||||||
|
const char *file,
|
||||||
|
const int line) {
|
||||||
assertf_(server != NULL, file, line);
|
assertf_(server != NULL, file, line);
|
||||||
assertf_(data != NULL, file, line);
|
assertf_(data != NULL, file, line);
|
||||||
|
|
||||||
@@ -248,32 +254,35 @@ static HTS_UNUSED socklen_t SOCaddr_copyaddr_(SOCaddr*const server,
|
|||||||
|
|
||||||
/** Copy hpaddr (length hpsize) into server, writing the result length into the
|
/** Copy hpaddr (length hpsize) into server, writing the result length into the
|
||||||
lvalue server_len (int). See SOCaddr_copyaddr_ for accepted forms. */
|
lvalue server_len (int). See SOCaddr_copyaddr_ for accepted forms. */
|
||||||
#define SOCaddr_copyaddr(server, server_len, hpaddr, hpsize) do { \
|
#define SOCaddr_copyaddr(server, server_len, hpaddr, hpsize) \
|
||||||
server_len = (int) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, __LINE__); \
|
do { \
|
||||||
|
server_len = (int) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, \
|
||||||
|
__LINE__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Like SOCaddr_copyaddr but discards the result length. */
|
/** Like SOCaddr_copyaddr but discards the result length. */
|
||||||
#define SOCaddr_copyaddr2(server, hpaddr, hpsize) do { \
|
#define SOCaddr_copyaddr2(server, hpaddr, hpsize) \
|
||||||
|
do { \
|
||||||
(void) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, __LINE__); \
|
(void) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, __LINE__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Copy one SOCaddr (src) into another (dest), preserving family and port. */
|
/** Copy one SOCaddr (src) into another (dest), preserving family and port. */
|
||||||
#define SOCaddr_copy_SOCaddr(dest, src) do { \
|
#define SOCaddr_copy_SOCaddr(dest, src) \
|
||||||
SOCaddr_copyaddr_(&(dest), &(src).m_addr.sa, SOCaddr_size(src), __FILE__, __LINE__); \
|
do { \
|
||||||
|
SOCaddr_copyaddr_(&(dest), &(src).m_addr.sa, SOCaddr_size(src), __FILE__, \
|
||||||
|
__LINE__); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Write the numeric (dotted/colon) host of ss into namebuf (capacity
|
/** Write the numeric (dotted/colon) host of ss into namebuf (capacity
|
||||||
namebuflen), scope id stripped. On failure namebuf becomes "". */
|
namebuflen), scope id stripped. On failure namebuf becomes "". */
|
||||||
static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
||||||
SOCaddr *const ss,
|
SOCaddr *const ss, const char *file,
|
||||||
const char *file, const int line) {
|
const int line) {
|
||||||
assertf_(namebuf != NULL, file, line);
|
assertf_(namebuf != NULL, file, line);
|
||||||
assertf_(ss != NULL, file, line);
|
assertf_(ss != NULL, file, line);
|
||||||
|
|
||||||
if (getnameinfo(&ss->m_addr.sa, sizeof(ss->m_addr),
|
if (getnameinfo(&ss->m_addr.sa, sizeof(ss->m_addr), namebuf, namebuflen, NULL,
|
||||||
namebuf, namebuflen,
|
0, NI_NUMERICHOST) == 0) {
|
||||||
NULL, 0,
|
|
||||||
NI_NUMERICHOST) == 0) {
|
|
||||||
/* remove scope id(s) */
|
/* remove scope id(s) */
|
||||||
char *const pos = strchr(namebuf, '%');
|
char *const pos = strchr(namebuf, '%');
|
||||||
if (pos != NULL) {
|
if (pos != NULL) {
|
||||||
@@ -289,11 +298,28 @@ static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
|||||||
SOCaddr_inetntoa_(namebuf, namebuflen, &(ss), __FILE__, __LINE__)
|
SOCaddr_inetntoa_(namebuf, namebuflen, &(ss), __FILE__, __LINE__)
|
||||||
|
|
||||||
/** Single-char family tag: '1' for IPv4, '2' otherwise (used in the cache). */
|
/** Single-char family tag: '1' for IPv4, '2' otherwise (used in the cache). */
|
||||||
#define SOCaddr_getproto(ss) ( SOCaddr_size(ss) == sizeof(struct sockaddr_in) ? '1' : '2')
|
#define SOCaddr_getproto(ss) \
|
||||||
|
(SOCaddr_size(ss) == sizeof(struct sockaddr_in) ? '1' : '2')
|
||||||
|
|
||||||
/** Length type for socket APIs (getsockname, accept, ...). */
|
/** Length type for socket APIs (getsockname, accept, ...). */
|
||||||
typedef socklen_t SOClen;
|
typedef socklen_t SOClen;
|
||||||
|
|
||||||
|
#if HTS_INET6 != 0
|
||||||
|
/** Resolver backend: getaddrinfo/freeaddrinfo as a swappable pair, so the
|
||||||
|
self-test can script DNS answers (families, multiplicity, errors)
|
||||||
|
in-process. The free function must match its getaddrinfo (a fake allocates
|
||||||
|
its own chain), hence the pair. */
|
||||||
|
typedef struct hts_resolver_backend {
|
||||||
|
int (*getaddrinfo)(const char *node, const char *service,
|
||||||
|
const struct addrinfo *hints, struct addrinfo **res);
|
||||||
|
void (*freeaddrinfo)(struct addrinfo *res);
|
||||||
|
} hts_resolver_backend;
|
||||||
|
|
||||||
|
/** Install a resolver backend for the process; NULL restores the libc default.
|
||||||
|
Test-only seam, not thread-safe; callers must serialize against resolves. */
|
||||||
|
void hts_dns_set_resolver_backend(const hts_resolver_backend *backend);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
71
src/htsopt.h
71
src/htsopt.h
@@ -72,6 +72,7 @@ typedef struct String String;
|
|||||||
#endif
|
#endif
|
||||||
#ifndef HTS_DEF_STRUCT_String
|
#ifndef HTS_DEF_STRUCT_String
|
||||||
#define HTS_DEF_STRUCT_String
|
#define HTS_DEF_STRUCT_String
|
||||||
|
|
||||||
struct String {
|
struct String {
|
||||||
char *buffer_;
|
char *buffer_;
|
||||||
size_t length_;
|
size_t length_;
|
||||||
@@ -179,6 +180,7 @@ typedef struct lien_url lien_url;
|
|||||||
|
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
|
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_log_type
|
#define HTS_DEF_DEFSTRUCT_hts_log_type
|
||||||
|
|
||||||
typedef enum hts_log_type {
|
typedef enum hts_log_type {
|
||||||
LOG_PANIC,
|
LOG_PANIC,
|
||||||
LOG_ERROR,
|
LOG_ERROR,
|
||||||
@@ -239,7 +241,7 @@ struct htsoptstate {
|
|||||||
char *userhttptype;
|
char *userhttptype;
|
||||||
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
||||||
int verif_external_status;
|
int verif_external_status;
|
||||||
t_dnscache *dns_cache; /**< DNS resolution cache */
|
coucal dns_cache; /**< DNS resolution cache: hostname -> t_dnscache record */
|
||||||
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
||||||
/* HTML parsing state */
|
/* HTML parsing state */
|
||||||
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
||||||
@@ -288,6 +290,7 @@ typedef enum htsparsejava_flags {
|
|||||||
/* Link-rewriting style for saved pages (opt->urlmode). */
|
/* Link-rewriting style for saved pages (opt->urlmode). */
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_urlmode
|
#ifndef HTS_DEF_DEFSTRUCT_hts_urlmode
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_urlmode
|
#define HTS_DEF_DEFSTRUCT_hts_urlmode
|
||||||
|
|
||||||
typedef enum hts_urlmode {
|
typedef enum hts_urlmode {
|
||||||
HTS_URLMODE_ABSOLUTE = 0, /**< absolute URL (http://host/path) everywhere */
|
HTS_URLMODE_ABSOLUTE = 0, /**< absolute URL (http://host/path) everywhere */
|
||||||
HTS_URLMODE_ABSOLUTE_FILE = 1, /**< legacy file: form, unused */
|
HTS_URLMODE_ABSOLUTE_FILE = 1, /**< legacy file: form, unused */
|
||||||
@@ -301,6 +304,7 @@ typedef enum hts_urlmode {
|
|||||||
/* Cache policy for updates and retries (opt->cache). */
|
/* Cache policy for updates and retries (opt->cache). */
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_cachemode
|
#ifndef HTS_DEF_DEFSTRUCT_hts_cachemode
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_cachemode
|
#define HTS_DEF_DEFSTRUCT_hts_cachemode
|
||||||
|
|
||||||
typedef enum hts_cachemode {
|
typedef enum hts_cachemode {
|
||||||
HTS_CACHE_NONE = 0, /**< no cache */
|
HTS_CACHE_NONE = 0, /**< no cache */
|
||||||
HTS_CACHE_PRIORITY = 1, /**< cache takes priority over the network */
|
HTS_CACHE_PRIORITY = 1, /**< cache takes priority over the network */
|
||||||
@@ -311,6 +315,7 @@ typedef enum hts_cachemode {
|
|||||||
/* Interactive wizard level (opt->wizard). */
|
/* Interactive wizard level (opt->wizard). */
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_wizard
|
#ifndef HTS_DEF_DEFSTRUCT_hts_wizard
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_wizard
|
#define HTS_DEF_DEFSTRUCT_hts_wizard
|
||||||
|
|
||||||
typedef enum hts_wizard {
|
typedef enum hts_wizard {
|
||||||
HTS_WIZARD_NONE = 0, /**< no wizard */
|
HTS_WIZARD_NONE = 0, /**< no wizard */
|
||||||
HTS_WIZARD_ASK = 1, /**< wizard asks questions */
|
HTS_WIZARD_ASK = 1, /**< wizard asks questions */
|
||||||
@@ -321,6 +326,7 @@ typedef enum hts_wizard {
|
|||||||
/* robots.txt / meta-robots obedience level (opt->robots). */
|
/* robots.txt / meta-robots obedience level (opt->robots). */
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_robots
|
#ifndef HTS_DEF_DEFSTRUCT_hts_robots
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_robots
|
#define HTS_DEF_DEFSTRUCT_hts_robots
|
||||||
|
|
||||||
typedef enum hts_robots {
|
typedef enum hts_robots {
|
||||||
HTS_ROBOTS_NEVER = 0, /**< ignore robots rules */
|
HTS_ROBOTS_NEVER = 0, /**< ignore robots rules */
|
||||||
HTS_ROBOTS_SOMETIMES = 1, /**< partial obedience (default) */
|
HTS_ROBOTS_SOMETIMES = 1, /**< partial obedience (default) */
|
||||||
@@ -342,24 +348,44 @@ typedef enum hts_seeker {
|
|||||||
HTS_SEEKER_UP = 1 << 1 /**< may ascend to parent directories */
|
HTS_SEEKER_UP = 1 << 1 /**< may ascend to parent directories */
|
||||||
} hts_seeker;
|
} hts_seeker;
|
||||||
|
|
||||||
/* Link-following scope, stored in the low byte of opt->travel. */
|
/* opt->travel: link-following scope in the low byte, flags OR'd in above it. */
|
||||||
typedef enum hts_travel_scope {
|
typedef enum hts_travel_scope {
|
||||||
HTS_TRAVEL_SAME_ADDRESS = 0, /**< stay on the same address (host) */
|
HTS_TRAVEL_SAME_ADDRESS = 0, /**< stay on the same address (host) */
|
||||||
HTS_TRAVEL_SAME_DOMAIN = 1, /**< stay on the same principal domain */
|
HTS_TRAVEL_SAME_DOMAIN = 1, /**< stay on the same principal domain */
|
||||||
HTS_TRAVEL_SAME_TLD = 2, /**< stay on the same TLD (e.g. .com) */
|
HTS_TRAVEL_SAME_TLD = 2, /**< stay on the same TLD (e.g. .com) */
|
||||||
HTS_TRAVEL_EVERYWHERE = 7 /**< follow links anywhere on the web */
|
HTS_TRAVEL_EVERYWHERE = 7, /**< follow links anywhere on the web */
|
||||||
|
HTS_TRAVEL_TEST_ALL = 1 << 8 /**< also test forbidden URLs (-t) */
|
||||||
} hts_travel_scope;
|
} hts_travel_scope;
|
||||||
|
|
||||||
/* Flags OR'd into opt->travel above the scope value. */
|
/* Mask selecting the scope value out of opt->travel. */
|
||||||
#define HTS_TRAVEL_SCOPE_MASK 0xff /**< mask selecting the scope value */
|
#define HTS_TRAVEL_SCOPE_MASK 0xff
|
||||||
#define HTS_TRAVEL_TEST_ALL (1 << 8) /**< also test forbidden URLs (-t) */
|
|
||||||
|
|
||||||
/* Boolean option flag. An enum (not C bool) so the option fields stay int-sized
|
/* Text progress display detail (opt->verbosedisplay). */
|
||||||
and the httrackp layout/ABI is unchanged. */
|
typedef enum hts_verbosedisplay {
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
HTS_VERBOSE_NONE = 0, /**< no animated progress display (default) */
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
HTS_VERBOSE_SIMPLE = 1, /**< minimal single-line progress */
|
||||||
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
HTS_VERBOSE_FULL = 2 /**< full animated progress */
|
||||||
#endif
|
} hts_verbosedisplay;
|
||||||
|
|
||||||
|
/* Delayed file-type resolution policy (opt->savename_delayed). */
|
||||||
|
typedef enum hts_savename_delayed {
|
||||||
|
HTS_SAVENAME_DELAYED_NONE = 0, /**< resolve the type immediately */
|
||||||
|
HTS_SAVENAME_DELAYED_SOFT = 1, /**< delay the type check when unknown */
|
||||||
|
HTS_SAVENAME_DELAYED_HARD = 2 /**< always delay the type check (default) */
|
||||||
|
} hts_savename_delayed;
|
||||||
|
|
||||||
|
/* Saved-name length layout (opt->savename_83). */
|
||||||
|
typedef enum hts_savename_83 {
|
||||||
|
HTS_SAVENAME_83_LONG = 0, /**< long file names (default) */
|
||||||
|
HTS_SAVENAME_83_DOS = 1, /**< DOS 8.3 names (ISO9660 level 1) */
|
||||||
|
HTS_SAVENAME_83_ISO9660 = 2 /**< ISO9660 level 2 names (up to 31 chars) */
|
||||||
|
} hts_savename_83;
|
||||||
|
|
||||||
|
/* Host-banning triggers (opt->hostcontrol bitmask). */
|
||||||
|
typedef enum hts_hostcontrol {
|
||||||
|
HTS_HOSTCONTROL_BAN_TIMEOUT = 1 << 0, /**< ban a timing-out host */
|
||||||
|
HTS_HOSTCONTROL_BAN_SLOW = 1 << 1 /**< ban a too-slow host */
|
||||||
|
} hts_hostcontrol;
|
||||||
|
|
||||||
#ifndef HTS_DEF_FWSTRUCT_lien_buffers
|
#ifndef HTS_DEF_FWSTRUCT_lien_buffers
|
||||||
#define HTS_DEF_FWSTRUCT_lien_buffers
|
#define HTS_DEF_FWSTRUCT_lien_buffers
|
||||||
@@ -393,7 +419,7 @@ struct httrackp {
|
|||||||
hts_urlmode
|
hts_urlmode
|
||||||
urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
||||||
hts_boolean no_type_change; // do not change file type according to MIME
|
hts_boolean no_type_change; // do not change file type according to MIME
|
||||||
int debug; /**< debug logging level */
|
hts_log_type debug; /**< debug logging level */
|
||||||
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
||||||
FILE *log; /**< informational log stream; NULL mutes it */
|
FILE *log; /**< informational log stream; NULL mutes it */
|
||||||
FILE *errlog; /**< error log stream; NULL mutes it */
|
FILE *errlog; /**< error log stream; NULL mutes it */
|
||||||
@@ -402,11 +428,11 @@ struct httrackp {
|
|||||||
LLint maxfile_html; /**< max bytes per HTML file */
|
LLint maxfile_html; /**< max bytes per HTML file */
|
||||||
int maxsoc; /**< max simultaneous sockets (-cN) */
|
int maxsoc; /**< max simultaneous sockets (-cN) */
|
||||||
LLint fragment; /**< split site after this many bytes */
|
LLint fragment; /**< split site after this many bytes */
|
||||||
hts_boolean
|
hts_tristate
|
||||||
nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||||
hts_boolean makeindex; /**< build a top-level index.html */
|
hts_boolean makeindex; /**< build a top-level index.html */
|
||||||
hts_boolean kindex; /**< build a keyword index */
|
hts_boolean kindex; /**< build a keyword index */
|
||||||
hts_boolean delete_old; /**< delete locally obsolete files after update */
|
hts_tristate delete_old; /**< delete locally obsolete files after update */
|
||||||
int timeout; /**< connection timeout in seconds */
|
int timeout; /**< connection timeout in seconds */
|
||||||
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
||||||
int maxtime; /**< max total mirror duration in seconds */
|
int maxtime; /**< max total mirror duration in seconds */
|
||||||
@@ -417,11 +443,12 @@ struct httrackp {
|
|||||||
// int aff_progress; // progress bar
|
// int aff_progress; // progress bar
|
||||||
hts_boolean shell; /**< driven by a shell over stdin/stdout pipes */
|
hts_boolean shell; /**< driven by a shell over stdin/stdout pipes */
|
||||||
t_proxy proxy; /**< proxy configuration */
|
t_proxy proxy; /**< proxy configuration */
|
||||||
int savename_83; /**< force 8.3 (DOS) file names */
|
hts_savename_83
|
||||||
|
savename_83; /**< saved-name length layout (long/DOS/ISO9660) */
|
||||||
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
||||||
String
|
String
|
||||||
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
||||||
int savename_delayed; // delayed type check
|
hts_savename_delayed savename_delayed; /**< delayed type-check policy */
|
||||||
hts_boolean
|
hts_boolean
|
||||||
delayed_cached; // delayed type check can be cached to speedup updates
|
delayed_cached; // delayed type check can be cached to speedup updates
|
||||||
hts_boolean mimehtml; /**< produce a single MIME/MHTML archive */
|
hts_boolean mimehtml; /**< produce a single MIME/MHTML archive */
|
||||||
@@ -437,14 +464,14 @@ struct httrackp {
|
|||||||
hts_boolean makestat; /**< maintain a transfer-statistics log */
|
hts_boolean makestat; /**< maintain a transfer-statistics log */
|
||||||
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
||||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||||
int hostcontrol; /**< drop hosts that are too slow, etc. */
|
int hostcontrol; /**< ban slow/timing-out hosts; see hts_hostcontrol bits */
|
||||||
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
hts_tristate errpage; /**< generate an error page on 404 and similar */
|
||||||
hts_boolean
|
hts_boolean
|
||||||
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||||
*/
|
*/
|
||||||
hts_boolean all_in_cache; /**< keep all retrieved data in the cache */
|
hts_boolean all_in_cache; /**< keep all retrieved data in the cache */
|
||||||
hts_robots robots; /**< robots.txt handling level */
|
hts_robots robots; /**< robots.txt handling level */
|
||||||
hts_boolean external; /**< render external links as error pages */
|
hts_tristate external; /**< render external links as error pages */
|
||||||
hts_boolean passprivacy; /**< strip passwords from external links */
|
hts_boolean passprivacy; /**< strip passwords from external links */
|
||||||
hts_boolean includequery; /**< include the query string in saved names */
|
hts_boolean includequery; /**< include the query string in saved names */
|
||||||
hts_boolean mirror_first_page; /**< only mirror the links of the first page */
|
hts_boolean mirror_first_page; /**< only mirror the links of the first page */
|
||||||
@@ -458,11 +485,11 @@ struct httrackp {
|
|||||||
hts_boolean sizehack; /**< treat same-size response as "updated" */
|
hts_boolean sizehack; /**< treat same-size response as "updated" */
|
||||||
hts_boolean urlhack; // force "url normalization" to avoid loops
|
hts_boolean urlhack; // force "url normalization" to avoid loops
|
||||||
hts_boolean tolerant; /**< accept an incorrect Content-Length */
|
hts_boolean tolerant; /**< accept an incorrect Content-Length */
|
||||||
hts_boolean
|
hts_tristate
|
||||||
parseall; /**< parse aggressively, including unknown tags with links */
|
parseall; /**< parse aggressively, including unknown tags with links */
|
||||||
hts_boolean parsedebug; /**< parser debug mode */
|
hts_boolean parsedebug; /**< parser debug mode */
|
||||||
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
||||||
int verbosedisplay; /**< animated text progress display */
|
hts_verbosedisplay verbosedisplay; /**< animated text progress display */
|
||||||
String footer; /**< footer/info line injected into pages */
|
String footer; /**< footer/info line injected into pages */
|
||||||
int maxcache; /**< in-memory cache backing limit (bytes) */
|
int maxcache; /**< in-memory cache backing limit (bytes) */
|
||||||
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
||||||
|
|||||||
184
src/htsparse.c
184
src/htsparse.c
@@ -296,6 +296,48 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
|||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||||
|
would underflow; space reads as the word boundary the guards want there. */
|
||||||
|
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||||
|
return html > start ? html[-1] : ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
/* True if [s, s+len) is exactly an HTTP method token (XHR.open's first
|
||||||
|
argument is a method, not a URL: #218). Case-insensitive. */
|
||||||
|
static int is_http_method(const char *s, size_t len) {
|
||||||
|
static const char *const methods[] = {"GET", "POST", "PUT",
|
||||||
|
"DELETE", "HEAD", "OPTIONS",
|
||||||
|
"PATCH", "TRACE", NULL};
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; methods[i] != NULL; i++) {
|
||||||
|
if (strlen(methods[i]) == len && strfield(s, methods[i]) == (int) len)
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Percent-encode '(' and ')' in a link emitted into an unquoted url(...) (CSS
|
||||||
|
or JS): a literal ')' closes the token early and the UA mis-parses the value
|
||||||
|
(#163). The UA decodes %28/%29 back to the saved-on-disk name. */
|
||||||
|
static void escape_url_parens(char *const s, const size_t size) {
|
||||||
|
char BIGSTK buff[HTS_URLMAXSIZE * 2];
|
||||||
|
size_t i, j;
|
||||||
|
|
||||||
|
for (i = 0, j = 0; s[i] != '\0' && j + 3 < size && j + 3 < sizeof(buff);
|
||||||
|
i++) {
|
||||||
|
if (s[i] == '(' || s[i] == ')') {
|
||||||
|
buff[j++] = '%';
|
||||||
|
buff[j++] = '2';
|
||||||
|
buff[j++] = s[i] == '(' ? '8' : '9';
|
||||||
|
} else {
|
||||||
|
buff[j++] = s[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buff[j] = '\0';
|
||||||
|
strlcpybuff(s, buff, size);
|
||||||
|
}
|
||||||
|
|
||||||
/* Main parser */
|
/* Main parser */
|
||||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||||
char catbuff[CATBUFF_SIZE];
|
char catbuff[CATBUFF_SIZE];
|
||||||
@@ -556,7 +598,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||||
p = strfield(html, "title");
|
p = strfield(html, "title");
|
||||||
if (p) {
|
if (p) {
|
||||||
if (*(html - 1) == '/')
|
if (html_prevc(html, r->adr) == '/')
|
||||||
p = 0; // /title
|
p = 0; // /title
|
||||||
} else {
|
} else {
|
||||||
if (strfield(html, "/html"))
|
if (strfield(html, "/html"))
|
||||||
@@ -1341,6 +1383,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
int can_avoid_quotes = 0;
|
int can_avoid_quotes = 0;
|
||||||
char quotes_replacement = '\0';
|
char quotes_replacement = '\0';
|
||||||
int ensure_not_mime = 0;
|
int ensure_not_mime = 0;
|
||||||
|
// .open(method,url): reject an HTTP-method first arg (#218)
|
||||||
|
int ensure_not_method = 0;
|
||||||
|
// @import: the quoted token is the URL; a trailing
|
||||||
|
// media/supports/layer condition is not part of it
|
||||||
|
int is_import = 0;
|
||||||
|
|
||||||
if (inscript_tag)
|
if (inscript_tag)
|
||||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||||
@@ -1357,9 +1404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if (!nc)
|
if (!nc)
|
||||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||||
if (!nc) { // location="doc"
|
if (!nc) { // location="doc"
|
||||||
if ((nc = strfield(html, "location"))
|
if ((nc = strfield(html, "location")) &&
|
||||||
&& !isspace(*(html - 1))
|
!isspace(html_prevc(html, r->adr)))
|
||||||
)
|
|
||||||
nc = 0;
|
nc = 0;
|
||||||
}
|
}
|
||||||
if (!nc)
|
if (!nc)
|
||||||
@@ -1369,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = "),"; // fin: virgule ou parenthèse
|
expected_end = "),"; // fin: virgule ou parenthèse
|
||||||
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
||||||
|
ensure_not_method = 1; // xhr.open: don't grab method
|
||||||
}
|
}
|
||||||
if (!nc)
|
if (!nc)
|
||||||
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
||||||
@@ -1380,7 +1427,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = ")"; // fin: parenthèse
|
expected_end = ")"; // fin: parenthèse
|
||||||
}
|
}
|
||||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
if (!nc && (nc = strfield(html, "url")) &&
|
||||||
|
(!isalnum(html_prevc(html, r->adr))) &&
|
||||||
|
html_prevc(html, r->adr) != '_') { // url(url)
|
||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = ")"; // fin: parenthèse
|
expected_end = ")"; // fin: parenthèse
|
||||||
can_avoid_quotes = 1;
|
can_avoid_quotes = 1;
|
||||||
@@ -1390,6 +1439,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((nc = strfield(html, "import"))) { // import "url"
|
if ((nc = strfield(html, "import"))) { // import "url"
|
||||||
if (is_space(*(html + nc))) {
|
if (is_space(*(html + nc))) {
|
||||||
expected = 0; // no char expected
|
expected = 0; // no char expected
|
||||||
|
is_import = 1;
|
||||||
} else
|
} else
|
||||||
nc = 0;
|
nc = 0;
|
||||||
}
|
}
|
||||||
@@ -1407,6 +1457,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||||
const char *b, *c;
|
const char *b, *c;
|
||||||
int ndelim = 1;
|
int ndelim = 1;
|
||||||
|
int valid_url = 0;
|
||||||
|
|
||||||
if ((*a == 34) || (*a == '\''))
|
if ((*a == 34) || (*a == '\''))
|
||||||
a++;
|
a++;
|
||||||
@@ -1421,12 +1472,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
c = b--;
|
c = b--;
|
||||||
|
// no closing delimiter here (truncated input):
|
||||||
|
// Don't scan past the buffer NUL or capture it.
|
||||||
|
if (*c != '\0') {
|
||||||
c += ndelim;
|
c += ndelim;
|
||||||
while (*c == ' ')
|
while (*c == ' ')
|
||||||
c++;
|
c++;
|
||||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
valid_url =
|
||||||
|| (*c == '\r')) {
|
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||||
c -= (ndelim + 1);
|
(*c == '\r') ||
|
||||||
|
(is_import && *(b + 1 + ndelim) == ' ');
|
||||||
|
}
|
||||||
|
if (valid_url) {
|
||||||
|
// URL end = last char (b), not the delimiter
|
||||||
|
c = b;
|
||||||
if ((int) (c - a + 1)) {
|
if ((int) (c - a + 1)) {
|
||||||
if (ensure_not_mime) {
|
if (ensure_not_mime) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@@ -1442,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// XHR.open's "GET" etc. is a method, not a URL
|
||||||
|
if (a != NULL && ensure_not_method &&
|
||||||
|
is_http_method(a, (size_t) (c - a + 1))) {
|
||||||
|
a = NULL;
|
||||||
|
}
|
||||||
// Check for bogus links (Vasiliy)
|
// Check for bogus links (Vasiliy)
|
||||||
if (a != NULL) {
|
if (a != NULL) {
|
||||||
const size_t size = c - a + 1;
|
const size_t size = c - a + 1;
|
||||||
@@ -1485,7 +1549,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1692,6 +1755,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
hts_nodetect[i -
|
hts_nodetect[i -
|
||||||
1]);
|
1]);
|
||||||
}
|
}
|
||||||
|
// xmlns / xmlns:prefix declare
|
||||||
|
// XML namespaces, not resources
|
||||||
|
// (#191)
|
||||||
|
else {
|
||||||
|
const int xl = strfield(
|
||||||
|
intag_startattr, "xmlns");
|
||||||
|
const char xc =
|
||||||
|
intag_startattr[xl];
|
||||||
|
if (xl &&
|
||||||
|
(xc == ':' || xc == '=' ||
|
||||||
|
is_space(xc))) {
|
||||||
|
url_ok = 0;
|
||||||
|
hts_log_print(
|
||||||
|
opt, LOG_DEBUG,
|
||||||
|
"dirty parsing: xmlns "
|
||||||
|
"namespace avoided");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2967,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
/* Never escape high-chars (we don't know the encoding!!) */
|
/* Never escape high-chars (we don't know the encoding!!) */
|
||||||
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
||||||
|
|
||||||
|
// unquoted url() (CSS/JS): keep parens escaped
|
||||||
|
if (ending_p == ')')
|
||||||
|
escape_url_parens(tempo, sizeof(tempo));
|
||||||
|
|
||||||
//if (!no_esc_utf)
|
//if (!no_esc_utf)
|
||||||
// escape_uri(tempo); // escape with %xx
|
// escape_uri(tempo); // escape with %xx
|
||||||
//else {
|
//else {
|
||||||
@@ -3664,44 +3749,60 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
|
|
||||||
} // bloc
|
} // bloc
|
||||||
// erreur HTTP (ex: 404, not found)
|
// erreur HTTP (ex: 404, not found)
|
||||||
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
|
} else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
|
||||||
|| (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
|
(r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
|
||||||
) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
|
// 412/416: the resume partial is stale; re-get the whole file (#206)
|
||||||
if (fexist_utf8(heap(ptr)->sav)) {
|
lien_back *itemback = NULL;
|
||||||
remove(heap(ptr)->sav); // Eliminer
|
int had_partial = 0;
|
||||||
} else {
|
int ref_existed = 0;
|
||||||
hts_log_print(opt, LOG_WARNING,
|
int ref_gone;
|
||||||
"Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
|
|
||||||
r->msg, urladr(), urlfil(),
|
// Drop the temp-ref, its partial, and heap->sav so the re-get carries no
|
||||||
heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
|
// Range; else back_add rebuilds the same Range and loops.
|
||||||
|
if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
|
||||||
|
&itemback) == 0) {
|
||||||
|
had_partial = 1;
|
||||||
|
ref_existed = 1;
|
||||||
|
// best-effort: an orphaned partial cannot re-Range once the ref is gone
|
||||||
|
if (fexist_utf8(itemback->url_sav))
|
||||||
|
(void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||||
|
itemback->url_sav));
|
||||||
|
back_clear_entry(itemback);
|
||||||
|
freet(itemback);
|
||||||
}
|
}
|
||||||
if (!fexist_utf8(heap(ptr)->sav)) { // Bien éliminé? (sinon on boucle..)
|
// don't re-record if the ref survived (it would re-Range and loop)
|
||||||
#if HDEBUG
|
ref_gone =
|
||||||
printf("Partial content NOT up-to-date, reget all file for %s\n",
|
url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
|
||||||
heap(ptr)->sav);
|
!ref_existed;
|
||||||
#endif
|
if (fexist_utf8(heap(ptr)->sav)) {
|
||||||
|
had_partial = 1;
|
||||||
|
remove(heap(ptr)->sav);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-get once, only if a partial existed and both Range triggers are
|
||||||
|
// gone; a failed removal gives up rather than looping. range_used is
|
||||||
|
// unreliable (it does not survive the delayed-type two-pass).
|
||||||
|
if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
|
||||||
r->msg, urladr(), urlfil());
|
r->msg, urladr(), urlfil());
|
||||||
// enregistrer le MEME lien
|
|
||||||
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||||
heap_top()->testmode = heap(ptr)->testmode; // mode test?
|
heap_top()->testmode = heap(ptr)->testmode;
|
||||||
heap_top()->link_import = 0; // pas mode import
|
heap_top()->link_import = 0;
|
||||||
heap_top()->depth = heap(ptr)->depth;
|
heap_top()->depth = heap(ptr)->depth;
|
||||||
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
|
||||||
heap_top()->retry = heap(ptr)->retry;
|
heap_top()->retry = heap(ptr)->retry;
|
||||||
heap_top()->premier = heap(ptr)->premier;
|
heap_top()->premier = heap(ptr)->premier;
|
||||||
heap_top()->precedent = ptr;
|
heap_top()->precedent = ptr;
|
||||||
//
|
|
||||||
// canceller lien actuel
|
|
||||||
error = 1;
|
error = 1;
|
||||||
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
hts_invalidate_link(opt, ptr); // invalidate hashtable entry
|
||||||
//
|
} else { // out of memory
|
||||||
} else { // oups erreur, plus de mémoire!!
|
XH_uninit;
|
||||||
XH_uninit; // désallocation mémoire & buffers
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
|
hts_log_print(opt, LOG_WARNING,
|
||||||
|
"Giving up on partial reget (%s) for %s%s", r->msg,
|
||||||
|
urladr(), urlfil());
|
||||||
error = 1;
|
error = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3722,7 +3823,8 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
//case -1: can_retry=1; break;
|
//case -1: can_retry=1; break;
|
||||||
case STATUSCODE_TIMEOUT:
|
case STATUSCODE_TIMEOUT:
|
||||||
if (opt->hostcontrol) { // timeout et retry épuisés
|
if (opt->hostcontrol) { // timeout et retry épuisés
|
||||||
if ((opt->hostcontrol & 1) && (heap(ptr)->retry <= 0)) {
|
if ((opt->hostcontrol & HTS_HOSTCONTROL_BAN_TIMEOUT) &&
|
||||||
|
(heap(ptr)->retry <= 0)) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||||
hts_log_print(opt, LOG_DEBUG,
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
@@ -3735,7 +3837,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
break;
|
break;
|
||||||
case STATUSCODE_SLOW:
|
case STATUSCODE_SLOW:
|
||||||
if ((opt->hostcontrol) && (heap(ptr)->retry <= 0)) { // too slow
|
if ((opt->hostcontrol) && (heap(ptr)->retry <= 0)) { // too slow
|
||||||
if (opt->hostcontrol & 2) {
|
if (opt->hostcontrol & HTS_HOSTCONTROL_BAN_SLOW) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||||
hts_log_print(opt, LOG_DEBUG,
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
@@ -4261,10 +4363,10 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
|||||||
char com[256];
|
char com[256];
|
||||||
|
|
||||||
linput(stdin, com, 200);
|
linput(stdin, com, 200);
|
||||||
if (opt->verbosedisplay == 2)
|
if (opt->verbosedisplay == HTS_VERBOSE_FULL)
|
||||||
opt->verbosedisplay = 1;
|
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||||
else
|
else
|
||||||
opt->verbosedisplay = 2;
|
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||||
/* Info for wrappers */
|
/* Info for wrappers */
|
||||||
hts_log_print(opt, LOG_INFO, "engine: change-options");
|
hts_log_print(opt, LOG_INFO, "engine: change-options");
|
||||||
RUN_CALLBACK0(opt, chopt);
|
RUN_CALLBACK0(opt, chopt);
|
||||||
@@ -4374,7 +4476,7 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
|||||||
printf("%c\x0d", ("/-\\|")[roll]);
|
printf("%c\x0d", ("/-\\|")[roll]);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
} else if (opt->verbosedisplay == 1) {
|
} else if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||||
if (b >= 0) {
|
if (b >= 0) {
|
||||||
if (back[b].r.statuscode == HTTP_OK)
|
if (back[b].r.statuscode == HTTP_OK)
|
||||||
printf("%d/%d: %s%s (" LLintP " bytes) - OK\33[K\r", ptr, opt->lien_tot,
|
printf("%d/%d: %s%s (" LLintP " bytes) - OK\33[K\r", ptr, opt->lien_tot,
|
||||||
@@ -4465,8 +4567,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
|||||||
char in_error_msg[32];
|
char in_error_msg[32];
|
||||||
|
|
||||||
// resolve unresolved type
|
// resolve unresolved type
|
||||||
if (opt->savename_delayed != 0 && *forbidden_url == 0 && IS_DELAYED_EXT(afs->save)
|
if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||||
&& !opt->state.stop) {
|
*forbidden_url == 0 && IS_DELAYED_EXT(afs->save) && !opt->state.stop) {
|
||||||
int loops;
|
int loops;
|
||||||
int continue_loop;
|
int continue_loop;
|
||||||
|
|
||||||
|
|||||||
103
src/htssafe.h
103
src/htssafe.h
@@ -58,7 +58,8 @@ HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define HTSSAFE_ABORT_FUNCTION(A,B,C) do { \
|
#define HTSSAFE_ABORT_FUNCTION(A, B, C) \
|
||||||
|
do { \
|
||||||
htsErrorCallback callback = hts_get_error_callback(); \
|
htsErrorCallback callback = hts_get_error_callback(); \
|
||||||
if (callback != NULL) { \
|
if (callback != NULL) { \
|
||||||
callback(A, B, C); \
|
callback(A, B, C); \
|
||||||
@@ -75,7 +76,8 @@ HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
|||||||
/**
|
/**
|
||||||
* Fatal assertion check.
|
* Fatal assertion check.
|
||||||
*/
|
*/
|
||||||
#define assertf__(exp, sexp, file, line) (void) ( (exp) || (abortf_(sexp, file, line), 0) )
|
#define assertf__(exp, sexp, file, line) \
|
||||||
|
(void) ((exp) || (abortf_(sexp, file, line), 0))
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fatal assertion check.
|
* Fatal assertion check.
|
||||||
@@ -106,7 +108,8 @@ static HTS_UNUSED void abortf_(const char *exp, const char *file, int line) {
|
|||||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||||
|
|
||||||
/* Note: char[] and const char[] are compatible */
|
/* Note: char[] and const char[] are compatible */
|
||||||
#define HTS_IS_CHAR_BUFFER(VAR) ( __builtin_types_compatible_p ( typeof (VAR), char[] ) )
|
#define HTS_IS_CHAR_BUFFER(VAR) \
|
||||||
|
(__builtin_types_compatible_p(typeof(VAR), char[]))
|
||||||
#else
|
#else
|
||||||
/* Note: a bit lame as char[8] won't be seen. */
|
/* Note: a bit lame as char[8] won't be seen. */
|
||||||
#define HTS_IS_CHAR_BUFFER(VAR) (sizeof(VAR) != sizeof(char *))
|
#define HTS_IS_CHAR_BUFFER(VAR) (sizeof(VAR) != sizeof(char *))
|
||||||
@@ -201,10 +204,13 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
*/
|
*/
|
||||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||||
|
|
||||||
#define strncatbuff(A, B, N) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
#define strncatbuff(A, B, N) \
|
||||||
|
__builtin_choose_expr( \
|
||||||
|
HTS_IS_CHAR_BUFFER(A), \
|
||||||
strncat_safe_(A, sizeof(A), B, \
|
strncat_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), N, \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), N, \
|
||||||
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__), \
|
"overflow while appending '" #B "' to '" #A "'", __FILE__, \
|
||||||
|
__LINE__), \
|
||||||
strncatbuff_ptr_((A), (B), (N)))
|
strncatbuff_ptr_((A), (B), (N)))
|
||||||
#else
|
#else
|
||||||
#define strncatbuff(A, B, N) \
|
#define strncatbuff(A, B, N) \
|
||||||
@@ -212,7 +218,8 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
? strncat(A, B, N) \
|
? strncat(A, B, N) \
|
||||||
: strncat_safe_(A, sizeof(A), B, \
|
: strncat_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), N, \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), N, \
|
||||||
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__) )
|
"overflow while appending '" #B "' to '" #A "'", \
|
||||||
|
__FILE__, __LINE__))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -222,18 +229,24 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
*/
|
*/
|
||||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||||
|
|
||||||
#define strcatbuff(A, B) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
#define strcatbuff(A, B) \
|
||||||
|
__builtin_choose_expr( \
|
||||||
|
HTS_IS_CHAR_BUFFER(A), \
|
||||||
strncat_safe_(A, sizeof(A), B, \
|
strncat_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), (size_t) -1, \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__), \
|
(size_t) -1, \
|
||||||
|
"overflow while appending '" #B "' to '" #A "'", __FILE__, \
|
||||||
|
__LINE__), \
|
||||||
strcatbuff_ptr_((A), (B)))
|
strcatbuff_ptr_((A), (B)))
|
||||||
#else
|
#else
|
||||||
#define strcatbuff(A, B) \
|
#define strcatbuff(A, B) \
|
||||||
(HTS_IS_NOT_CHAR_BUFFER(A) \
|
(HTS_IS_NOT_CHAR_BUFFER(A) \
|
||||||
? strcat(A, B) \
|
? strcat(A, B) \
|
||||||
: strncat_safe_(A, sizeof(A), B, \
|
: strncat_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), (size_t) -1, \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__) )
|
(size_t) -1, \
|
||||||
|
"overflow while appending '" #B "' to '" #A "'", \
|
||||||
|
__FILE__, __LINE__))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -243,10 +256,13 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
*/
|
*/
|
||||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||||
|
|
||||||
#define strcpybuff(A, B) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
#define strcpybuff(A, B) \
|
||||||
|
__builtin_choose_expr( \
|
||||||
|
HTS_IS_CHAR_BUFFER(A), \
|
||||||
strcpy_safe_(A, sizeof(A), B, \
|
strcpy_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
"overflow while copying '" #B "' to '"#A"'", __FILE__, __LINE__), \
|
"overflow while copying '" #B "' to '" #A "'", __FILE__, \
|
||||||
|
__LINE__), \
|
||||||
strcpybuff_ptr_((A), (B)))
|
strcpybuff_ptr_((A), (B)))
|
||||||
#else
|
#else
|
||||||
#define strcpybuff(A, B) \
|
#define strcpybuff(A, B) \
|
||||||
@@ -254,7 +270,8 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
? strcpy(A, B) \
|
? strcpy(A, B) \
|
||||||
: strcpy_safe_(A, sizeof(A), B, \
|
: strcpy_safe_(A, sizeof(A), B, \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
"overflow while copying '" #B "' to '"#A"'", __FILE__, __LINE__) )
|
"overflow while copying '" #B "' to '" #A "'", __FILE__, \
|
||||||
|
__LINE__))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -269,9 +286,9 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
* Append characters of "B" to "A", "A" having a maximum capacity of "S".
|
* Append characters of "B" to "A", "A" having a maximum capacity of "S".
|
||||||
*/
|
*/
|
||||||
#define strlcatbuff(A, B, S) \
|
#define strlcatbuff(A, B, S) \
|
||||||
strncat_safe_(A, S, B, \
|
strncat_safe_(A, S, B, HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), (size_t) -1, \
|
(size_t) -1, "overflow while appending '" #B "' to '" #A "'", \
|
||||||
"overflow while appending '" #B "' to '"#A"'", __FILE__, __LINE__)
|
__FILE__, __LINE__)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Append at most "N" characters of "B" to "A", "A" having a maximum capacity
|
* Append at most "N" characters of "B" to "A", "A" having a maximum capacity
|
||||||
@@ -286,16 +303,17 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
|||||||
* Copy characters of "B" to "A", "A" having a maximum capacity of "S".
|
* Copy characters of "B" to "A", "A" having a maximum capacity of "S".
|
||||||
*/
|
*/
|
||||||
#define strlcpybuff(A, B, S) \
|
#define strlcpybuff(A, B, S) \
|
||||||
strcpy_safe_(A, S, B, \
|
strcpy_safe_(A, S, B, HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
"overflow while copying '" #B "' to '" #A "'", __FILE__, \
|
||||||
"overflow while copying '" #B "' to '"#A"'", __FILE__, __LINE__)
|
__LINE__)
|
||||||
|
|
||||||
/** strnlen replacement (autotools). **/
|
/** strnlen replacement (autotools). **/
|
||||||
#if (!defined(_WIN32) && !defined(HAVE_STRNLEN))
|
#if (!defined(_WIN32) && !defined(HAVE_STRNLEN))
|
||||||
|
|
||||||
static HTS_UNUSED size_t strnlen(const char *s, size_t maxlen) {
|
static HTS_UNUSED size_t strnlen(const char *s, size_t maxlen) {
|
||||||
size_t i;
|
size_t i;
|
||||||
for(i = 0 ; i < maxlen && s[i] != '\0' ; i++) ;
|
for (i = 0; i < maxlen && s[i] != '\0'; i++)
|
||||||
|
;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -304,12 +322,13 @@ static HTS_UNUSED size_t strnlen(const char *s, size_t maxlen) {
|
|||||||
Aborts if source is NULL or has no NUL within that capacity. The sentinel
|
Aborts if source is NULL or has no NUL within that capacity. The sentinel
|
||||||
sizeof_source == (size_t)-1 means "capacity unknown", and falls back to the
|
sizeof_source == (size_t)-1 means "capacity unknown", and falls back to the
|
||||||
unbounded strlen (used when the source is a pointer rather than an array). */
|
unbounded strlen (used when the source is a pointer rather than an array). */
|
||||||
static HTS_INLINE HTS_UNUSED size_t strlen_safe_(const char *source, const size_t sizeof_source,
|
static HTS_INLINE HTS_UNUSED size_t strlen_safe_(const char *source,
|
||||||
|
const size_t sizeof_source,
|
||||||
const char *file, int line) {
|
const char *file, int line) {
|
||||||
size_t size;
|
size_t size;
|
||||||
assertf_(source != NULL, file, line);
|
assertf_(source != NULL, file, line);
|
||||||
size = sizeof_source != (size_t) -1
|
size = sizeof_source != (size_t) -1 ? strnlen(source, sizeof_source)
|
||||||
? strnlen(source, sizeof_source) : strlen(source);
|
: strlen(source);
|
||||||
assertf_(size < sizeof_source, file, line);
|
assertf_(size < sizeof_source, file, line);
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
@@ -319,10 +338,10 @@ static HTS_INLINE HTS_UNUSED size_t strlen_safe_(const char *source, const size_
|
|||||||
source's capacity or (size_t)-1 if unknown. Aborts if the result (existing
|
source's capacity or (size_t)-1 if unknown. Aborts if the result (existing
|
||||||
dest length + appended bytes + NUL) would not fit sizeof_dest: this NEVER
|
dest length + appended bytes + NUL) would not fit sizeof_dest: this NEVER
|
||||||
truncates. Always NUL-terminates on success. */
|
truncates. Always NUL-terminates on success. */
|
||||||
static HTS_INLINE HTS_UNUSED char* strncat_safe_(char *const dest, const size_t sizeof_dest,
|
static HTS_INLINE HTS_UNUSED char *
|
||||||
|
strncat_safe_(char *const dest, const size_t sizeof_dest,
|
||||||
const char *const source, const size_t sizeof_source,
|
const char *const source, const size_t sizeof_source,
|
||||||
const size_t n,
|
const size_t n, const char *exp, const char *file, int line) {
|
||||||
const char *exp, const char *file, int line) {
|
|
||||||
const size_t source_len = strlen_safe_(source, sizeof_source, file, line);
|
const size_t source_len = strlen_safe_(source, sizeof_source, file, line);
|
||||||
const size_t dest_len = strlen_safe_(dest, sizeof_dest, file, line);
|
const size_t dest_len = strlen_safe_(dest, sizeof_dest, file, line);
|
||||||
/* note: "size_t is an unsigned integral type" ((size_t) -1 is positive) */
|
/* note: "size_t is an unsigned integral type" ((size_t) -1 is positive) */
|
||||||
@@ -337,12 +356,14 @@ static HTS_INLINE HTS_UNUSED char* strncat_safe_(char *const dest, const size_t
|
|||||||
/* Core bounded copy: empties dest then appends all of source via
|
/* Core bounded copy: empties dest then appends all of source via
|
||||||
strncat_safe_. sizeof_dest is dest's total capacity (NUL included). Aborts
|
strncat_safe_. sizeof_dest is dest's total capacity (NUL included). Aborts
|
||||||
(no truncation) if source plus its NUL would not fit. */
|
(no truncation) if source plus its NUL would not fit. */
|
||||||
static HTS_INLINE HTS_UNUSED char* strcpy_safe_(char *const dest, const size_t sizeof_dest,
|
static HTS_INLINE HTS_UNUSED char *
|
||||||
|
strcpy_safe_(char *const dest, const size_t sizeof_dest,
|
||||||
const char *const source, const size_t sizeof_source,
|
const char *const source, const size_t sizeof_source,
|
||||||
const char *exp, const char *file, int line) {
|
const char *exp, const char *file, int line) {
|
||||||
assertf_(sizeof_dest != 0, file, line);
|
assertf_(sizeof_dest != 0, file, line);
|
||||||
dest[0] = '\0';
|
dest[0] = '\0';
|
||||||
return strncat_safe_(dest, sizeof_dest, source, sizeof_source, (size_t) -1, exp, file, line);
|
return strncat_safe_(dest, sizeof_dest, source, sizeof_source, (size_t) -1,
|
||||||
|
exp, file, line);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -385,22 +406,28 @@ static HTS_INLINE HTS_UNUSED htsbuff htsbuff_ptr_(char *buf, size_t cap) {
|
|||||||
|
|
||||||
/* 0 for an array, a -1 array-size compile error for a pointer. */
|
/* 0 for an array, a -1 array-size compile error for a pointer. */
|
||||||
#define htsbuff_must_be_array_(A) \
|
#define htsbuff_must_be_array_(A) \
|
||||||
(sizeof(char[1 - 2 * !!__builtin_types_compatible_p(typeof(A), typeof(&(A)[0]))]) - 1)
|
(sizeof(char[1 - 2 * !!__builtin_types_compatible_p(typeof(A), \
|
||||||
|
typeof(&(A)[0]))]) - \
|
||||||
|
1)
|
||||||
|
|
||||||
#define htsbuff_array(ARR) htsbuff_ptr_((ARR), sizeof(ARR) + htsbuff_must_be_array_(ARR))
|
#define htsbuff_array(ARR) \
|
||||||
|
htsbuff_ptr_((ARR), sizeof(ARR) + htsbuff_must_be_array_(ARR))
|
||||||
#else
|
#else
|
||||||
#define htsbuff_array(ARR) htsbuff_ptr_((ARR), sizeof(ARR))
|
#define htsbuff_array(ARR) htsbuff_ptr_((ARR), sizeof(ARR))
|
||||||
#endif
|
#endif
|
||||||
/** Builder over pointer P of known capacity N (N includes the NUL). */
|
/** Builder over pointer P of known capacity N (N includes the NUL). */
|
||||||
#define htsbuff_ptr(P, N) htsbuff_ptr_((P), (N))
|
#define htsbuff_ptr(P, N) htsbuff_ptr_((P), (N))
|
||||||
|
|
||||||
/** Append at most n characters of s (stopping at its NUL). Aborts on overflow. */
|
/** Append at most n characters of s (stopping at its NUL). Aborts on overflow.
|
||||||
static HTS_INLINE HTS_UNUSED void htsbuff_catn(htsbuff *b, const char *s, size_t n) {
|
*/
|
||||||
|
static HTS_INLINE HTS_UNUSED void htsbuff_catn(htsbuff *b, const char *s,
|
||||||
|
size_t n) {
|
||||||
const size_t add = strnlen(s, n);
|
const size_t add = strnlen(s, n);
|
||||||
/* Overflow-safe: keep the (potentially huge) 'add' alone on one side. The
|
/* Overflow-safe: keep the (potentially huge) 'add' alone on one side. The
|
||||||
maintained invariant len < cap makes 'cap - len' >= 1 (no underflow), so
|
maintained invariant len < cap makes 'cap - len' >= 1 (no underflow), so
|
||||||
'add < cap - len' cannot wrap the way 'len + add < cap' could. */
|
'add < cap - len' cannot wrap the way 'len + add < cap' could. */
|
||||||
assertf__(add < b->cap - b->len, "htsbuff append overflow", __FILE__, __LINE__);
|
assertf__(add < b->cap - b->len, "htsbuff append overflow", __FILE__,
|
||||||
|
__LINE__);
|
||||||
memcpy(b->buf + b->len, s, add);
|
memcpy(b->buf + b->len, s, add);
|
||||||
b->len += add;
|
b->len += add;
|
||||||
b->buf[b->len] = '\0';
|
b->buf[b->len] = '\0';
|
||||||
@@ -437,7 +464,13 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
|
|||||||
|
|
||||||
#define calloct(A, B) calloc((A), (B))
|
#define calloct(A, B) calloc((A), (B))
|
||||||
|
|
||||||
#define freet(A) do { if ((A) != NULL) { free(A); (A) = NULL; } } while(0)
|
#define freet(A) \
|
||||||
|
do { \
|
||||||
|
if ((A) != NULL) { \
|
||||||
|
free(A); \
|
||||||
|
(A) = NULL; \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
#define strdupt(A) strdup(A)
|
#define strdupt(A) strdup(A)
|
||||||
|
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ typedef struct String String;
|
|||||||
#endif
|
#endif
|
||||||
#ifndef HTS_DEF_STRUCT_String
|
#ifndef HTS_DEF_STRUCT_String
|
||||||
#define HTS_DEF_STRUCT_String
|
#define HTS_DEF_STRUCT_String
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Growable owned string.
|
* Growable owned string.
|
||||||
*
|
*
|
||||||
@@ -131,14 +132,16 @@ struct String {
|
|||||||
|
|
||||||
/** Drop the last byte and re-terminate. Undefined if the String is empty
|
/** Drop the last byte and re-terminate. Undefined if the String is empty
|
||||||
(no length check; would underflow). **/
|
(no length check; would underflow). **/
|
||||||
#define StringPopRight(BLK) do { \
|
#define StringPopRight(BLK) \
|
||||||
|
do { \
|
||||||
StringBuffRW(BLK)[--StringLength(BLK)] = '\0'; \
|
StringBuffRW(BLK)[--StringLength(BLK)] = '\0'; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Grow so capacity_ >= CAPACITY (total bytes, including the NUL). May realloc
|
/** Grow so capacity_ >= CAPACITY (total bytes, including the NUL). May realloc
|
||||||
(invalidating prior buffer pointers); aborts via STRING_ASSERT on OOM.
|
(invalidating prior buffer pointers); aborts via STRING_ASSERT on OOM.
|
||||||
Never shrinks. **/
|
Never shrinks. **/
|
||||||
#define StringRoomTotal(BLK, CAPACITY) do { \
|
#define StringRoomTotal(BLK, CAPACITY) \
|
||||||
|
do { \
|
||||||
const size_t capacity_ = (size_t) (CAPACITY); \
|
const size_t capacity_ = (size_t) (CAPACITY); \
|
||||||
while ((BLK).capacity_ < capacity_) { \
|
while ((BLK).capacity_ < capacity_) { \
|
||||||
if ((BLK).capacity_ < 16) { \
|
if ((BLK).capacity_ < 16) { \
|
||||||
@@ -153,11 +156,13 @@ struct String {
|
|||||||
|
|
||||||
/** Reserve room for SIZE more bytes beyond the current length (plus the NUL).
|
/** Reserve room for SIZE more bytes beyond the current length (plus the NUL).
|
||||||
May realloc, invalidating prior buffer pointers. **/
|
May realloc, invalidating prior buffer pointers. **/
|
||||||
#define StringRoom(BLK, SIZE) StringRoomTotal(BLK, StringLength(BLK) + (SIZE) + 1)
|
#define StringRoom(BLK, SIZE) \
|
||||||
|
StringRoomTotal(BLK, StringLength(BLK) + (SIZE) + 1)
|
||||||
|
|
||||||
/** Reserve room for SIZE more bytes and return the (post-realloc) RW buffer,
|
/** Reserve room for SIZE more bytes and return the (post-realloc) RW buffer,
|
||||||
for appending in place. Does not update length_; the caller must. **/
|
for appending in place. Does not update length_; the caller must. **/
|
||||||
#define StringBuffN(BLK, SIZE) StringBuffN_(&(BLK), SIZE)
|
#define StringBuffN(BLK, SIZE) StringBuffN_(&(BLK), SIZE)
|
||||||
|
|
||||||
HTS_STATIC char *StringBuffN_(String *blk, int size) {
|
HTS_STATIC char *StringBuffN_(String *blk, int size) {
|
||||||
StringRoom(*blk, size);
|
StringRoom(*blk, size);
|
||||||
return StringBuffRW(*blk);
|
return StringBuffRW(*blk);
|
||||||
@@ -166,7 +171,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
/** Zero the fields (NULL buffer, no allocation). Use on an uninitialized
|
/** Zero the fields (NULL buffer, no allocation). Use on an uninitialized
|
||||||
String only; does NOT free an existing buffer (use StringFree to reset
|
String only; does NOT free an existing buffer (use StringFree to reset
|
||||||
an owned one), so calling it on a live String leaks. **/
|
an owned one), so calling it on a live String leaks. **/
|
||||||
#define StringInit(BLK) do { \
|
#define StringInit(BLK) \
|
||||||
|
do { \
|
||||||
(BLK).buffer_ = NULL; \
|
(BLK).buffer_ = NULL; \
|
||||||
(BLK).capacity_ = 0; \
|
(BLK).capacity_ = 0; \
|
||||||
(BLK).length_ = 0; \
|
(BLK).length_ = 0; \
|
||||||
@@ -174,7 +180,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
|
|
||||||
/** Truncate to length 0, keeping the allocation. Forces a non-NULL buffer
|
/** Truncate to length 0, keeping the allocation. Forces a non-NULL buffer
|
||||||
(allocates if empty) and writes the leading NUL, so StringBuff is "". **/
|
(allocates if empty) and writes the leading NUL, so StringBuff is "". **/
|
||||||
#define StringClear(BLK) do { \
|
#define StringClear(BLK) \
|
||||||
|
do { \
|
||||||
(BLK).length_ = 0; \
|
(BLK).length_ = 0; \
|
||||||
StringRoom(BLK, 0); \
|
StringRoom(BLK, 0); \
|
||||||
(BLK).buffer_[0] = '\0'; \
|
(BLK).buffer_[0] = '\0'; \
|
||||||
@@ -182,7 +189,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
|
|
||||||
/** Set length_ to SIZE, or to strlen(buffer_) if SIZE is negative. Caller
|
/** Set length_ to SIZE, or to strlen(buffer_) if SIZE is negative. Caller
|
||||||
asserts SIZE fits the existing content; does not (re)allocate. **/
|
asserts SIZE fits the existing content; does not (re)allocate. **/
|
||||||
#define StringSetLength(BLK, SIZE) do { \
|
#define StringSetLength(BLK, SIZE) \
|
||||||
|
do { \
|
||||||
if (SIZE >= 0) { \
|
if (SIZE >= 0) { \
|
||||||
(BLK).length_ = SIZE; \
|
(BLK).length_ = SIZE; \
|
||||||
} else { \
|
} else { \
|
||||||
@@ -192,7 +200,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
|
|
||||||
/** Release the owned buffer and reset to the empty state (NULL buffer).
|
/** Release the owned buffer and reset to the empty state (NULL buffer).
|
||||||
Idempotent; safe on an already-empty String. **/
|
Idempotent; safe on an already-empty String. **/
|
||||||
#define StringFree(BLK) do { \
|
#define StringFree(BLK) \
|
||||||
|
do { \
|
||||||
if ((BLK).buffer_ != NULL) { \
|
if ((BLK).buffer_ != NULL) { \
|
||||||
STRING_FREE((BLK).buffer_); \
|
STRING_FREE((BLK).buffer_); \
|
||||||
(BLK).buffer_ = NULL; \
|
(BLK).buffer_ = NULL; \
|
||||||
@@ -207,7 +216,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
freed or used by the caller afterwards. length_/capacity_ are set to
|
freed or used by the caller afterwards. length_/capacity_ are set to
|
||||||
strlen(STR) (capacity_ here excludes the NUL, so the next append reallocs).
|
strlen(STR) (capacity_ here excludes the NUL, so the next append reallocs).
|
||||||
**/
|
**/
|
||||||
#define StringSetBuffer(BLK, STR) do { \
|
#define StringSetBuffer(BLK, STR) \
|
||||||
|
do { \
|
||||||
size_t len__ = strlen(STR); \
|
size_t len__ = strlen(STR); \
|
||||||
StringFree(BLK); \
|
StringFree(BLK); \
|
||||||
(BLK).buffer_ = (STR); \
|
(BLK).buffer_ = (STR); \
|
||||||
@@ -218,7 +228,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
/** Append SIZE raw bytes from STR (NULs allowed as data). Grows as needed and
|
/** Append SIZE raw bytes from STR (NULs allowed as data). Grows as needed and
|
||||||
re-terminates with a NUL after the appended bytes. STR must not alias
|
re-terminates with a NUL after the appended bytes. STR must not alias
|
||||||
BLK's buffer (a realloc would invalidate it). **/
|
BLK's buffer (a realloc would invalidate it). **/
|
||||||
#define StringMemcat(BLK, STR, SIZE) do { \
|
#define StringMemcat(BLK, STR, SIZE) \
|
||||||
|
do { \
|
||||||
const char *str_mc_ = (STR); \
|
const char *str_mc_ = (STR); \
|
||||||
const size_t size_mc_ = (size_t) (SIZE); \
|
const size_t size_mc_ = (size_t) (SIZE); \
|
||||||
StringRoom(BLK, size_mc_); \
|
StringRoom(BLK, size_mc_); \
|
||||||
@@ -231,13 +242,15 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
|||||||
|
|
||||||
/** Replace content with SIZE raw bytes from STR (NULs allowed as data).
|
/** Replace content with SIZE raw bytes from STR (NULs allowed as data).
|
||||||
Same non-aliasing requirement as StringMemcat. **/
|
Same non-aliasing requirement as StringMemcat. **/
|
||||||
#define StringMemcpy(BLK, STR, SIZE) do { \
|
#define StringMemcpy(BLK, STR, SIZE) \
|
||||||
|
do { \
|
||||||
(BLK).length_ = 0; \
|
(BLK).length_ = 0; \
|
||||||
StringMemcat(BLK, STR, SIZE); \
|
StringMemcat(BLK, STR, SIZE); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/** Append one byte and re-terminate. Grows as needed. **/
|
/** Append one byte and re-terminate. Grows as needed. **/
|
||||||
#define StringAddchar(BLK, c) do { \
|
#define StringAddchar(BLK, c) \
|
||||||
|
do { \
|
||||||
String *const s__ = &(BLK); \
|
String *const s__ = &(BLK); \
|
||||||
char c__ = (c); \
|
char c__ = (c); \
|
||||||
StringRoom(*s__, 1); \
|
StringRoom(*s__, 1); \
|
||||||
@@ -281,7 +294,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
|||||||
|
|
||||||
/** Append the C string STR (up to its NUL). No-op if STR is NULL. STR must not
|
/** Append the C string STR (up to its NUL). No-op if STR is NULL. STR must not
|
||||||
alias BLK's buffer. **/
|
alias BLK's buffer. **/
|
||||||
#define StringCat(BLK, STR) do { \
|
#define StringCat(BLK, STR) \
|
||||||
|
do { \
|
||||||
const char *const str__ = (STR); \
|
const char *const str__ = (STR); \
|
||||||
if (str__ != NULL) { \
|
if (str__ != NULL) { \
|
||||||
const size_t size__ = strlen(str__); \
|
const size_t size__ = strlen(str__); \
|
||||||
@@ -291,7 +305,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
|||||||
|
|
||||||
/** Append at most SIZE leading bytes of the C string STR. No-op if STR is
|
/** Append at most SIZE leading bytes of the C string STR. No-op if STR is
|
||||||
NULL. STR must not alias BLK's buffer. **/
|
NULL. STR must not alias BLK's buffer. **/
|
||||||
#define StringCatN(BLK, STR, SIZE) do { \
|
#define StringCatN(BLK, STR, SIZE) \
|
||||||
|
do { \
|
||||||
const char *str__ = (STR); \
|
const char *str__ = (STR); \
|
||||||
if (str__ != NULL) { \
|
if (str__ != NULL) { \
|
||||||
size_t size__ = strlen(str__); \
|
size_t size__ = strlen(str__); \
|
||||||
@@ -304,7 +319,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
|||||||
|
|
||||||
/** Replace content with at most SIZE leading bytes of the C string STR.
|
/** Replace content with at most SIZE leading bytes of the C string STR.
|
||||||
If STR is NULL, clears to "". STR must not alias BLK's buffer. **/
|
If STR is NULL, clears to "". STR must not alias BLK's buffer. **/
|
||||||
#define StringCopyN(BLK, STR, SIZE) do { \
|
#define StringCopyN(BLK, STR, SIZE) \
|
||||||
|
do { \
|
||||||
const char *str__ = (STR); \
|
const char *str__ = (STR); \
|
||||||
const size_t usize__ = (SIZE); \
|
const size_t usize__ = (SIZE); \
|
||||||
(BLK).length_ = 0; \
|
(BLK).length_ = 0; \
|
||||||
@@ -326,7 +342,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
|||||||
/** Replace content with a copy of the C string STR. If STR is NULL, clears to
|
/** Replace content with a copy of the C string STR. If STR is NULL, clears to
|
||||||
"". STR must not alias BLK's buffer (use StringCopyOverlapped if it might).
|
"". STR must not alias BLK's buffer (use StringCopyOverlapped if it might).
|
||||||
**/
|
**/
|
||||||
#define StringCopy(BLK, STR) do { \
|
#define StringCopy(BLK, STR) \
|
||||||
|
do { \
|
||||||
const char *str__ = (STR); \
|
const char *str__ = (STR); \
|
||||||
if (str__ != NULL) { \
|
if (str__ != NULL) { \
|
||||||
size_t size__ = strlen(str__); \
|
size_t size__ = strlen(str__); \
|
||||||
@@ -338,7 +355,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
|||||||
|
|
||||||
/** Like StringCopy but safe when STR aliases BLK's own buffer: copies via a
|
/** Like StringCopy but safe when STR aliases BLK's own buffer: copies via a
|
||||||
temporary, so a self-copy or overlap is well-defined. **/
|
temporary, so a self-copy or overlap is well-defined. **/
|
||||||
#define StringCopyOverlapped(BLK, STR) do { \
|
#define StringCopyOverlapped(BLK, STR) \
|
||||||
|
do { \
|
||||||
String s__ = STRING_EMPTY; \
|
String s__ = STRING_EMPTY; \
|
||||||
StringCopy(s__, STR); \
|
StringCopy(s__, STR); \
|
||||||
StringCopyS(BLK, s__); \
|
StringCopyS(BLK, s__); \
|
||||||
|
|||||||
@@ -1213,7 +1213,7 @@ HTSEXT_API find_handle hts_findfirst(char *path) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_findnext(find_handle find) {
|
HTSEXT_API hts_boolean hts_findnext(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if ((FindNextFileA(find->handle, &find->hdata)))
|
if ((FindNextFileA(find->handle, &find->hdata)))
|
||||||
@@ -1273,7 +1273,7 @@ HTSEXT_API int hts_findgetsize(find_handle find) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_findisdir(find_handle find) {
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
if (!hts_findissystem(find)) {
|
if (!hts_findissystem(find)) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@@ -1287,7 +1287,7 @@ HTSEXT_API int hts_findisdir(find_handle find) {
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_findisfile(find_handle find) {
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
if (!hts_findissystem(find)) {
|
if (!hts_findissystem(find)) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@@ -1301,7 +1301,7 @@ HTSEXT_API int hts_findisfile(find_handle find) {
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_findissystem(find_handle find) {
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if (find->hdata.
|
if (find->hdata.
|
||||||
|
|||||||
@@ -108,15 +108,15 @@ HTSEXT_API int hts_buildtopindex(httrackp * opt, const char *path,
|
|||||||
// Portable directory find functions
|
// Portable directory find functions
|
||||||
// Directory find functions
|
// Directory find functions
|
||||||
HTSEXT_API find_handle hts_findfirst(char *path);
|
HTSEXT_API find_handle hts_findfirst(char *path);
|
||||||
HTSEXT_API int hts_findnext(find_handle find);
|
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||||
HTSEXT_API int hts_findclose(find_handle find);
|
HTSEXT_API int hts_findclose(find_handle find);
|
||||||
|
|
||||||
//
|
//
|
||||||
HTSEXT_API char *hts_findgetname(find_handle find);
|
HTSEXT_API char *hts_findgetname(find_handle find);
|
||||||
HTSEXT_API int hts_findgetsize(find_handle find);
|
HTSEXT_API int hts_findgetsize(find_handle find);
|
||||||
HTSEXT_API int hts_findisdir(find_handle find);
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||||
HTSEXT_API int hts_findisfile(find_handle find);
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||||
HTSEXT_API int hts_findissystem(find_handle find);
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ typedef struct strc_int2bytes2 strc_int2bytes2;
|
|||||||
#endif
|
#endif
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
|
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_log_type
|
#define HTS_DEF_DEFSTRUCT_hts_log_type
|
||||||
|
|
||||||
/** Log severity levels, most to least severe. A message is emitted only if its
|
/** Log severity levels, most to least severe. A message is emitted only if its
|
||||||
level is <= opt->debug. LOG_ERRNO is a flag OR'd into the level to append
|
level is <= opt->debug. LOG_ERRNO is a flag OR'd into the level to append
|
||||||
": <strerror(errno)>" to the message. */
|
": <strerror(errno)>" to the message. */
|
||||||
@@ -111,8 +112,10 @@ requires: htsdefines.h */
|
|||||||
* CALLBACKARG_USERDEF(). Allocates a t_hts_callbackarg with hts_malloc (not
|
* CALLBACKARG_USERDEF(). Allocates a t_hts_callbackarg with hts_malloc (not
|
||||||
* checked for OOM); it is freed by hts_free_opt().
|
* checked for OOM); it is freed by hts_free_opt().
|
||||||
*/
|
*/
|
||||||
#define CHAIN_FUNCTION(OPT, MEMBER, FUNCTION, ARGUMENT) do { \
|
#define CHAIN_FUNCTION(OPT, MEMBER, FUNCTION, ARGUMENT) \
|
||||||
t_hts_callbackarg *carg = (t_hts_callbackarg*) hts_malloc(sizeof(t_hts_callbackarg)); \
|
do { \
|
||||||
|
t_hts_callbackarg *carg = \
|
||||||
|
(t_hts_callbackarg *) hts_malloc(sizeof(t_hts_callbackarg)); \
|
||||||
carg->userdef = (ARGUMENT); \
|
carg->userdef = (ARGUMENT); \
|
||||||
carg->prev.fun = (void *) (OPT)->callbacks_fun->MEMBER.fun; \
|
carg->prev.fun = (void *) (OPT)->callbacks_fun->MEMBER.fun; \
|
||||||
carg->prev.carg = (OPT)->callbacks_fun->MEMBER.carg; \
|
carg->prev.carg = (OPT)->callbacks_fun->MEMBER.carg; \
|
||||||
@@ -120,8 +123,10 @@ requires: htsdefines.h */
|
|||||||
(OPT)->callbacks_fun->MEMBER.carg = carg; \
|
(OPT)->callbacks_fun->MEMBER.carg = carg; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* The following helpers are useful only if you know that an existing callback migh be existing before before the call to CHAIN_FUNCTION()
|
/* The following helpers are useful only if you know that an existing callback
|
||||||
If your functions were added just after hts_create_opt(), no need to make the previous function check */
|
migh be existing before before the call to CHAIN_FUNCTION() If your functions
|
||||||
|
were added just after hts_create_opt(), no need to make the previous function
|
||||||
|
check */
|
||||||
|
|
||||||
/** Inside a chained callback, return the ARGUMENT pointer originally passed to
|
/** Inside a chained callback, return the ARGUMENT pointer originally passed to
|
||||||
CHAIN_FUNCTION(), or NULL when CARG is NULL. */
|
CHAIN_FUNCTION(), or NULL when CARG is NULL. */
|
||||||
@@ -129,11 +134,13 @@ If your functions were added just after hts_create_opt(), no need to make the pr
|
|||||||
|
|
||||||
/** Return the callback of type NAME that this one chained over, cast to its
|
/** Return the callback of type NAME that this one chained over, cast to its
|
||||||
function-pointer type, or NULL. Call it to forward to the prior handler. */
|
function-pointer type, or NULL. Call it to forward to the prior handler. */
|
||||||
#define CALLBACKARG_PREV_FUN(CARG, NAME) ( (t_hts_htmlcheck_ ##NAME) ( ( (CARG) != NULL ) ? (CARG)->prev.fun : NULL ) )
|
#define CALLBACKARG_PREV_FUN(CARG, NAME) \
|
||||||
|
((t_hts_htmlcheck_##NAME)(((CARG) != NULL) ? (CARG)->prev.fun : NULL))
|
||||||
|
|
||||||
/** Return the carg of the callback this one chained over (pass it when
|
/** Return the carg of the callback this one chained over (pass it when
|
||||||
forwarding to the CALLBACKARG_PREV_FUN result), or NULL. */
|
forwarding to the CALLBACKARG_PREV_FUN result), or NULL. */
|
||||||
#define CALLBACKARG_PREV_CARG(CARG) ( ( (CARG) != NULL ) ? (CARG)->prev.carg : NULL )
|
#define CALLBACKARG_PREV_CARG(CARG) \
|
||||||
|
(((CARG) != NULL) ? (CARG)->prev.carg : NULL)
|
||||||
|
|
||||||
/* Functions */
|
/* Functions */
|
||||||
|
|
||||||
@@ -206,13 +213,14 @@ HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
|||||||
/* Logging */
|
/* Logging */
|
||||||
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
||||||
opt->log is NULL. Prefer hts_log_print(). */
|
opt->log is NULL. Prefer hts_log_print(). */
|
||||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg);
|
HTSEXT_API hts_boolean hts_log(httrackp *opt, const char *prefix,
|
||||||
|
const char *msg);
|
||||||
|
|
||||||
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
||||||
Forwards to the registered log callback, and when the level is <= opt->debug
|
Forwards to the registered log callback, and when the level is <= opt->debug
|
||||||
also to opt->log. @p format must be non-NULL. */
|
also to opt->log. @p format must be non-NULL. */
|
||||||
HTSEXT_API void hts_log_print(httrackp * opt, int type, const char *format,
|
HTSEXT_API void hts_log_print(httrackp *opt, int type, const char *format, ...)
|
||||||
...) HTS_PRINTF_FUN(3, 4);
|
HTS_PRINTF_FUN(3, 4);
|
||||||
|
|
||||||
/** va_list form of hts_log_print(). @p opt may be NULL (only the callback
|
/** va_list form of hts_log_print(). @p opt may be NULL (only the callback
|
||||||
runs). Preserves errno. @p format must be non-NULL. */
|
runs). Preserves errno. @p format must be non-NULL. */
|
||||||
@@ -254,7 +262,8 @@ HTSEXT_API int htswrap_add(httrackp * opt, const char *name, void *fct);
|
|||||||
or 0 if none or unknown. */
|
or 0 if none or unknown. */
|
||||||
HTSEXT_API uintptr_t htswrap_read(httrackp *opt, const char *name);
|
HTSEXT_API uintptr_t htswrap_read(httrackp *opt, const char *name);
|
||||||
|
|
||||||
/* Internal library allocators, if a different libc is being used by the client */
|
/* Internal library allocators, if a different libc is being used by the client
|
||||||
|
*/
|
||||||
/** strdup() through the library allocator. Returns a heap copy freed with
|
/** strdup() through the library allocator. Returns a heap copy freed with
|
||||||
hts_free(), or NULL on failure. */
|
hts_free(), or NULL on failure. */
|
||||||
HTSEXT_API char *hts_strdup(const char *string);
|
HTSEXT_API char *hts_strdup(const char *string);
|
||||||
@@ -313,7 +322,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, char *adr);
|
|||||||
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
||||||
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
||||||
captured request line. */
|
captured request line. */
|
||||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data);
|
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||||
|
char *data);
|
||||||
|
|
||||||
/* State */
|
/* State */
|
||||||
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
||||||
@@ -334,10 +344,10 @@ HTSEXT_API int hts_is_exiting(httrackp * opt);
|
|||||||
caller-owned, NULL-terminated array of strings; the engine stores the
|
caller-owned, NULL-terminated array of strings; the engine stores the
|
||||||
pointer without copying, so the array and its strings must stay valid until
|
pointer without copying, so the array and its strings must stay valid until
|
||||||
the engine consumes them. @return nonzero if a list is now set. */
|
the engine consumes them. @return nonzero if a list is now set. */
|
||||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url);
|
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url);
|
||||||
|
|
||||||
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
||||||
HTSEXT_API int hts_resetaddurl(httrackp * opt);
|
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt);
|
||||||
|
|
||||||
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
||||||
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
||||||
@@ -356,7 +366,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int);
|
|||||||
lock, so it is safe to call from another thread). @p force is currently
|
lock, so it is safe to call from another thread). @p force is currently
|
||||||
ignored.
|
ignored.
|
||||||
@return 0; no-op if @p opt is NULL. */
|
@return 0; no-op if @p opt is NULL. */
|
||||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force);
|
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force);
|
||||||
|
|
||||||
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
||||||
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
||||||
@@ -373,7 +383,7 @@ HTSEXT_API void hts_cancel_parsing(httrackp * opt);
|
|||||||
|
|
||||||
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
||||||
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
||||||
HTSEXT_API int hts_has_stopped(httrackp * opt);
|
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt);
|
||||||
|
|
||||||
/* Tools */
|
/* Tools */
|
||||||
/** Ensure the directory chain leading to @p path exists, creating missing
|
/** Ensure the directory chain leading to @p path exists, creating missing
|
||||||
@@ -390,7 +400,7 @@ HTSEXT_API int structcheck_utf8(const char *path);
|
|||||||
/** Whether the directory containing @p path exists. The basename is stripped
|
/** Whether the directory containing @p path exists. The basename is stripped
|
||||||
first, so passing a file path tests its parent directory. @return 1 if it is
|
first, so passing a file path tests its parent directory. @return 1 if it is
|
||||||
a directory, 0 otherwise. */
|
a directory, 0 otherwise. */
|
||||||
HTSEXT_API int dir_exists(const char *path);
|
HTSEXT_API hts_boolean dir_exists(const char *path);
|
||||||
|
|
||||||
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
||||||
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
||||||
@@ -488,40 +498,50 @@ HTSEXT_API void unescape_amp(char *s);
|
|||||||
|
|
||||||
/** Percent-escape only spaces (' ' becomes "%20"); copy everything else
|
/** Percent-escape only spaces (' ' becomes "%20"); copy everything else
|
||||||
* verbatim. */
|
* verbatim. */
|
||||||
HTSEXT_API size_t escape_spc_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_spc_url(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Aggressively percent-escape @p src for use as a single URL path segment
|
/** Aggressively percent-escape @p src for use as a single URL path segment
|
||||||
(reserved, delimiter, unwise, special, avoid and mark characters). */
|
(reserved, delimiter, unwise, special, avoid and mark characters). */
|
||||||
HTSEXT_API size_t escape_in_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_in_url(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Percent-escape @p src as a URI, escaping only what is necessary and keeping
|
/** Percent-escape @p src as a URI, escaping only what is necessary and keeping
|
||||||
'/' and other reserved characters. */
|
'/' and other reserved characters. */
|
||||||
HTSEXT_API size_t escape_uri(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_uri(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Like escape_uri() for a UTF-8 URI: also escapes reserved characters other
|
/** Like escape_uri() for a UTF-8 URI: also escapes reserved characters other
|
||||||
than '/'. */
|
than '/'. */
|
||||||
HTSEXT_API size_t escape_uri_utf(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_uri_utf(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Minimal "make safe" escape: percent-escapes only '"', ' ' and control
|
/** Minimal "make safe" escape: percent-escapes only '"', ' ' and control
|
||||||
characters, leaving an already-formed URL otherwise intact. */
|
characters, leaving an already-formed URL otherwise intact. */
|
||||||
HTSEXT_API size_t escape_check_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_check_url(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Append-variant of escape_spc_url(): escapes @p src after the existing
|
/** Append-variant of escape_spc_url(): escapes @p src after the existing
|
||||||
NUL-terminated content of @p dest. Returns the bytes appended (excluding the
|
NUL-terminated content of @p dest. Returns the bytes appended (excluding the
|
||||||
NUL). */
|
NUL). */
|
||||||
HTSEXT_API size_t append_escape_spc_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t append_escape_spc_url(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Append-variant of escape_in_url(). See append_escape_spc_url(). */
|
/** Append-variant of escape_in_url(). See append_escape_spc_url(). */
|
||||||
HTSEXT_API size_t append_escape_in_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t append_escape_in_url(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Append-variant of escape_uri(). See append_escape_spc_url(). */
|
/** Append-variant of escape_uri(). See append_escape_spc_url(). */
|
||||||
HTSEXT_API size_t append_escape_uri(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t append_escape_uri(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Append-variant of escape_uri_utf(). See append_escape_spc_url(). */
|
/** Append-variant of escape_uri_utf(). See append_escape_spc_url(). */
|
||||||
HTSEXT_API size_t append_escape_uri_utf(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t append_escape_uri_utf(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Append-variant of escape_check_url(). See append_escape_spc_url(). */
|
/** Append-variant of escape_check_url(). See append_escape_spc_url(). */
|
||||||
HTSEXT_API size_t append_escape_check_url(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API size_t append_escape_check_url(const char *const src,
|
||||||
|
char *const dest, const size_t size);
|
||||||
|
|
||||||
/** In-place variant of escape_spc_url(): escapes the NUL-terminated string in
|
/** In-place variant of escape_spc_url(): escapes the NUL-terminated string in
|
||||||
@p dest back into @p dest. */
|
@p dest back into @p dest. */
|
||||||
@@ -541,53 +561,60 @@ HTSEXT_API size_t inplace_escape_check_url(char *const dest, const size_t size);
|
|||||||
|
|
||||||
/** Same escaping as escape_check_url() but returns @p dest instead of the byte
|
/** Same escaping as escape_check_url() but returns @p dest instead of the byte
|
||||||
count. */
|
count. */
|
||||||
HTSEXT_API char *escape_check_url_addr(const char *const src, char *const dest, const size_t size);
|
HTSEXT_API char *escape_check_url_addr(const char *const src, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Build a MIME/MHTML content-id token in @p dest from @p adr and @p fil:
|
/** Build a MIME/MHTML content-id token in @p dest from @p adr and @p fil:
|
||||||
escape_in_url() both, then replace every '%' with 'X' so the result is one
|
escape_in_url() both, then replace every '%' with 'X' so the result is one
|
||||||
opaque token. */
|
opaque token. */
|
||||||
HTSEXT_API size_t make_content_id(const char *const adr, const char *const fil, char *const dest, const size_t size);
|
HTSEXT_API size_t make_content_id(const char *const adr, const char *const fil,
|
||||||
|
char *const dest, const size_t size);
|
||||||
|
|
||||||
/** Low-level percent-escaper backing the escape_* family. @p mode selects the
|
/** Low-level percent-escaper backing the escape_* family. @p mode selects the
|
||||||
character class to escape: 0 check_url, 1 in_url, 2 spc_url, 3 uri,
|
character class to escape: 0 check_url, 1 in_url, 2 spc_url, 3 uri,
|
||||||
30 uri_utf. @p max_size is the dest capacity including the NUL. */
|
30 uri_utf. @p max_size is the dest capacity including the NUL. */
|
||||||
HTSEXT_API size_t x_escape_http(const char *const s, char *const dest, const size_t max_size, const int mode);
|
HTSEXT_API size_t x_escape_http(const char *const s, char *const dest,
|
||||||
|
const size_t max_size, const int mode);
|
||||||
|
|
||||||
/** Strip all control characters (byte value < 32) from @p s in place. */
|
/** Strip all control characters (byte value < 32) from @p s in place. */
|
||||||
HTSEXT_API void escape_remove_control(char *const s);
|
HTSEXT_API void escape_remove_control(char *const s);
|
||||||
|
|
||||||
/** HTML-escape for text output: rewrite '&' to "&amp;" and pass every other
|
/** HTML-escape for text output: rewrite '&' to "&amp;" and pass every other
|
||||||
byte through unchanged. */
|
byte through unchanged. */
|
||||||
HTSEXT_API size_t escape_for_html_print(const char *const s, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_for_html_print(const char *const s, char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Like escape_for_html_print() but also convert every high byte (>= 128) to a
|
/** Like escape_for_html_print() but also convert every high byte (>= 128) to a
|
||||||
numeric entity "&#xNN;". */
|
numeric entity "&#xNN;". */
|
||||||
HTSEXT_API size_t escape_for_html_print_full(const char *const s, char *const dest, const size_t size);
|
HTSEXT_API size_t escape_for_html_print_full(const char *const s,
|
||||||
|
char *const dest,
|
||||||
|
const size_t size);
|
||||||
|
|
||||||
/** Percent-decode @p s into @p catbuff (capacity @p size) and return @p
|
/** Percent-decode @p s into @p catbuff (capacity @p size) and return @p
|
||||||
catbuff. Decodes every "%xx" hex escape. */
|
catbuff. Decodes every "%xx" hex escape. */
|
||||||
HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const char *const s);
|
HTSEXT_API char *unescape_http(char *const catbuff, const size_t size,
|
||||||
|
const char *const s);
|
||||||
|
|
||||||
/** Percent-decode @p s into @p catbuff, but only the escapes that are safe to
|
/** Percent-decode @p s into @p catbuff, but only the escapes that are safe to
|
||||||
decode while keeping a valid URI (reserved, delimiter, unwise, control and
|
decode while keeping a valid URI (reserved, delimiter, unwise, control and
|
||||||
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
||||||
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
||||||
space. Returns @p catbuff. */
|
space. Returns @p catbuff. */
|
||||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size, const char *s, const int no_high);
|
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||||
|
const char *s, const hts_boolean no_high);
|
||||||
|
|
||||||
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
||||||
@p ssize): user --assume rules, then ".html", then the built-in extension
|
@p ssize): user --assume rules, then ".html", then the built-in extension
|
||||||
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
||||||
0 otherwise. */
|
0 otherwise. */
|
||||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||||
const char *fil, int flag);
|
const char *fil, hts_boolean flag);
|
||||||
|
|
||||||
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
||||||
HTS_MIMETYPE_SIZE capacity. */
|
HTS_MIMETYPE_SIZE capacity. */
|
||||||
HTS_DEPRECATED("use get_httptype_sized(opt, s, ssize, fil, flag)")
|
HTS_DEPRECATED("use get_httptype_sized(opt, s, ssize, fil, flag)")
|
||||||
|
|
||||||
HTSEXT_API void get_httptype(httrackp * opt, char *s, const char *fil,
|
HTSEXT_API void get_httptype(httrackp *opt, char *s, const char *fil, int flag);
|
||||||
int flag);
|
|
||||||
|
|
||||||
/** Classify @p fil by its extension: 0 unknown, 1 known non-HTML, 2 known HTML.
|
/** Classify @p fil by its extension: 0 unknown, 1 known non-HTML, 2 known HTML.
|
||||||
Consults the built-in table then user --assume rules. 0 for a NULL @p fil.
|
Consults the built-in table then user --assume rules. 0 for a NULL @p fil.
|
||||||
@@ -600,7 +627,7 @@ HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil);
|
|||||||
|
|
||||||
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
||||||
known dynamic-page type, else 0. */
|
known dynamic-page type, else 0. */
|
||||||
HTSEXT_API int is_dyntype(const char *fil);
|
HTSEXT_API hts_boolean is_dyntype(const char *fil);
|
||||||
|
|
||||||
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
||||||
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
||||||
@@ -610,12 +637,12 @@ HTSEXT_API const char *get_ext(char *catbuff, size_t size, const char *fil);
|
|||||||
|
|
||||||
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
||||||
and a built-in keep-list of commonly mislabeled types), else 0. */
|
and a built-in keep-list of commonly mislabeled types), else 0. */
|
||||||
HTSEXT_API int may_unknown(httrackp * opt, const char *st);
|
HTSEXT_API hts_boolean may_unknown(httrackp *opt, const char *st);
|
||||||
|
|
||||||
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
||||||
always producing a type. @return 1 if a type was written. */
|
always producing a type. @return 1 if a type was written. */
|
||||||
HTSEXT_API int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
HTSEXT_API hts_boolean guess_httptype_sized(httrackp *opt, char *s,
|
||||||
const char *fil);
|
size_t ssize, const char *fil);
|
||||||
|
|
||||||
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
||||||
HTS_MIMETYPE_SIZE capacity. */
|
HTS_MIMETYPE_SIZE capacity. */
|
||||||
@@ -630,11 +657,13 @@ HTSEXT_API void guess_httptype(httrackp * opt, char *s, const char *fil);
|
|||||||
time), not a pointer. */
|
time), not a pointer. */
|
||||||
/** Concatenate @p a and @p b into @p catbuff (NULL or empty operands are
|
/** Concatenate @p a and @p b into @p catbuff (NULL or empty operands are
|
||||||
* skipped). */
|
* skipped). */
|
||||||
HTSEXT_API char *concat(char *catbuff, size_t size, const char *a, const char *b);
|
HTSEXT_API char *concat(char *catbuff, size_t size, const char *a,
|
||||||
|
const char *b);
|
||||||
|
|
||||||
/** Like concat(a, b) but convert '/' to the platform path separator (Windows).
|
/** Like concat(a, b) but convert '/' to the platform path separator (Windows).
|
||||||
*/
|
*/
|
||||||
HTSEXT_API char *fconcat(char *catbuff, size_t size, const char *a, const char *b);
|
HTSEXT_API char *fconcat(char *catbuff, size_t size, const char *a,
|
||||||
|
const char *b);
|
||||||
|
|
||||||
/** Copy @p a into @p catbuff, converting '/' to the platform path separator
|
/** Copy @p a into @p catbuff, converting '/' to the platform path separator
|
||||||
(Windows). */
|
(Windows). */
|
||||||
@@ -677,7 +706,7 @@ HTSEXT_API find_handle hts_findfirst(char *path);
|
|||||||
|
|
||||||
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
||||||
at end of directory. */
|
at end of directory. */
|
||||||
HTSEXT_API int hts_findnext(find_handle find);
|
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||||
|
|
||||||
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
||||||
HTSEXT_API int hts_findclose(find_handle find);
|
HTSEXT_API int hts_findclose(find_handle find);
|
||||||
@@ -692,16 +721,16 @@ HTSEXT_API int hts_findgetsize(find_handle find);
|
|||||||
|
|
||||||
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
||||||
hts_findissystem(), reports 0). */
|
hts_findissystem(), reports 0). */
|
||||||
HTSEXT_API int hts_findisdir(find_handle find);
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||||
|
|
||||||
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
||||||
see hts_findissystem(), reports 0). */
|
see hts_findissystem(), reports 0). */
|
||||||
HTSEXT_API int hts_findisfile(find_handle find);
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||||
|
|
||||||
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
||||||
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
||||||
temporary entries. Else 0. */
|
temporary entries. Else 0. */
|
||||||
HTSEXT_API int hts_findissystem(find_handle find);
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||||
|
|
||||||
/* UTF-8 aware FILE API */
|
/* UTF-8 aware FILE API */
|
||||||
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
||||||
@@ -753,7 +782,8 @@ typedef struct utimbuf STRUCT_UTIMBUF;
|
|||||||
/** Macro aimed to break at build-time if a size is not a sizeof() strictly
|
/** Macro aimed to break at build-time if a size is not a sizeof() strictly
|
||||||
* greater than sizeof(char*). **/
|
* greater than sizeof(char*). **/
|
||||||
#undef COMPILE_TIME_CHECK_SIZE
|
#undef COMPILE_TIME_CHECK_SIZE
|
||||||
#define COMPILE_TIME_CHECK_SIZE(A) (void) ((void (*)(char[A - sizeof(char*) - 1])) NULL)
|
#define COMPILE_TIME_CHECK_SIZE(A) \
|
||||||
|
(void) ((void (*)(char[A - sizeof(char *) - 1])) NULL)
|
||||||
|
|
||||||
/** Macro aimed to break at compile-time if a size is not a sizeof() strictly
|
/** Macro aimed to break at compile-time if a size is not a sizeof() strictly
|
||||||
* greater than sizeof(char*). **/
|
* greater than sizeof(char*). **/
|
||||||
|
|||||||
@@ -288,7 +288,7 @@ static void __cdecl htsshow_uninit(t_hts_callbackarg * carg) {
|
|||||||
}
|
}
|
||||||
static int __cdecl htsshow_start(t_hts_callbackarg * carg, httrackp * opt) {
|
static int __cdecl htsshow_start(t_hts_callbackarg * carg, httrackp * opt) {
|
||||||
use_show = 0;
|
use_show = 0;
|
||||||
if (opt->verbosedisplay == 2) {
|
if (opt->verbosedisplay == HTS_VERBOSE_FULL) {
|
||||||
use_show = 1;
|
use_show = 1;
|
||||||
vt_clear();
|
vt_clear();
|
||||||
}
|
}
|
||||||
@@ -852,7 +852,7 @@ static void sig_doback(int blind) { // mettre en backing
|
|||||||
if (global_opt != NULL) {
|
if (global_opt != NULL) {
|
||||||
// suppress logging and asking lousy questions
|
// suppress logging and asking lousy questions
|
||||||
global_opt->quiet = 1;
|
global_opt->quiet = 1;
|
||||||
global_opt->verbosedisplay = 0;
|
global_opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!blind)
|
if (!blind)
|
||||||
|
|||||||
@@ -1176,11 +1176,15 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
|||||||
if (element != NULL) {
|
if (element != NULL) {
|
||||||
msgCode = element->statuscode;
|
msgCode = element->statuscode;
|
||||||
StringRoom(headers, 8192);
|
StringRoom(headers, 8192);
|
||||||
sprintf(StringBuffRW(headers), "HTTP/1.1 %d %s\r\n"
|
sprintf(StringBuffRW(headers),
|
||||||
|
"HTTP/1.1 %d %s\r\n"
|
||||||
#ifndef NO_WEBDAV
|
#ifndef NO_WEBDAV
|
||||||
"%s"
|
"%s"
|
||||||
#endif
|
#endif
|
||||||
"Content-Type: %s%s%s%s\r\n" "%s%s%s" "%s%s%s" "%s%s%s",
|
"Content-Type: %s%s%s%s\r\n"
|
||||||
|
"%s%s%s"
|
||||||
|
"%s%s%s"
|
||||||
|
"%s%s%s",
|
||||||
/* */
|
/* */
|
||||||
msgCode, element->msg,
|
msgCode, element->msg,
|
||||||
#ifndef NO_WEBDAV
|
#ifndef NO_WEBDAV
|
||||||
@@ -1188,16 +1192,18 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
|||||||
StringBuff(davHeaders),
|
StringBuff(davHeaders),
|
||||||
#endif
|
#endif
|
||||||
/* Content-type: foo; [ charset=bar ] */
|
/* Content-type: foo; [ charset=bar ] */
|
||||||
element->contenttype,
|
hts_effective_mime(element->contenttype),
|
||||||
((element->charset[0]) ? "; charset=\"" : ""),
|
((element->charset[0]) ? "; charset=\"" : ""),
|
||||||
element->charset, ((element->charset[0]) ? "\"" : ""),
|
element->charset, ((element->charset[0]) ? "\"" : ""),
|
||||||
/* location */
|
/* location */
|
||||||
((element->location != NULL
|
((element->location != NULL && element->location[0])
|
||||||
&& element->location[0]) ? "Location: " : ""),
|
? "Location: "
|
||||||
((element->location != NULL
|
: ""),
|
||||||
&& element->location[0]) ? element->location : ""),
|
((element->location != NULL && element->location[0])
|
||||||
((element->location != NULL
|
? element->location
|
||||||
&& element->location[0]) ? "\r\n" : ""),
|
: ""),
|
||||||
|
((element->location != NULL && element->location[0]) ? "\r\n"
|
||||||
|
: ""),
|
||||||
/* last-modified */
|
/* last-modified */
|
||||||
((element->lastmodified[0]) ? "Last-Modified: " : ""),
|
((element->lastmodified[0]) ? "Last-Modified: " : ""),
|
||||||
((element->lastmodified[0]) ? element->lastmodified : ""),
|
((element->lastmodified[0]) ? element->lastmodified : ""),
|
||||||
@@ -1205,8 +1211,7 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
|||||||
/* etag */
|
/* etag */
|
||||||
((element->etag[0]) ? "ETag: " : ""),
|
((element->etag[0]) ? "ETag: " : ""),
|
||||||
((element->etag[0]) ? element->etag : ""),
|
((element->etag[0]) ? element->etag : ""),
|
||||||
((element->etag[0]) ? "\r\n" : "")
|
((element->etag[0]) ? "\r\n" : ""));
|
||||||
);
|
|
||||||
StringLength(headers) = (int) strlen(StringBuff(headers));
|
StringLength(headers) = (int) strlen(StringBuff(headers));
|
||||||
} else {
|
} else {
|
||||||
/* No query string, no ending / : check the the <url>/ page */
|
/* No query string, no ending / : check the the <url>/ page */
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ Please visit our Website: http://www.httrack.com
|
|||||||
|
|
||||||
#include "htscore.h"
|
#include "htscore.h"
|
||||||
#include "htsback.h"
|
#include "htsback.h"
|
||||||
|
#include "htslib.h" /* hts_effective_mime */
|
||||||
|
|
||||||
#include "store.h"
|
#include "store.h"
|
||||||
#include "proxystrings.h"
|
#include "proxystrings.h"
|
||||||
@@ -2289,10 +2290,17 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
|||||||
int size_headers;
|
int size_headers;
|
||||||
|
|
||||||
sprintf(st->headers,
|
sprintf(st->headers,
|
||||||
"HTTP/1.0 %d %s" "\r\n" "X-Server: ProxyTrack " PROXYTRACK_VERSION
|
"HTTP/1.0 %d %s"
|
||||||
"\r\n" "Content-type: %s%s%s%s" "\r\n" "Last-modified: %s" "\r\n"
|
"\r\n"
|
||||||
"Content-length: %d" "\r\n", element->statuscode, element->msg,
|
"X-Server: ProxyTrack " PROXYTRACK_VERSION "\r\n"
|
||||||
/**/ element->contenttype,
|
"Content-type: %s%s%s%s"
|
||||||
|
"\r\n"
|
||||||
|
"Last-modified: %s"
|
||||||
|
"\r\n"
|
||||||
|
"Content-length: %d"
|
||||||
|
"\r\n",
|
||||||
|
element->statuscode, element->msg,
|
||||||
|
/**/ hts_effective_mime(element->contenttype),
|
||||||
(element->charset[0] ? "; charset=\"" : ""),
|
(element->charset[0] ? "; charset=\"" : ""),
|
||||||
(element->charset[0] ? element->charset : ""),
|
(element->charset[0] ? element->charset : ""),
|
||||||
(element->charset[0] ? "\"" : ""), /**/ element->lastmodified,
|
(element->charset[0] ? "\"" : ""), /**/ element->lastmodified,
|
||||||
@@ -2328,10 +2336,10 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
|||||||
/* args */
|
/* args */
|
||||||
(link_has_authority(url) ? "" : "http://"), url, "0.0.0.0",
|
(link_has_authority(url) ? "" : "http://"), url, "0.0.0.0",
|
||||||
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
|
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
|
||||||
tm->tm_min, tm->tm_sec, element->contenttype, element->statuscode,
|
tm->tm_min, tm->tm_sec, hts_effective_mime(element->contenttype),
|
||||||
st->md5, (element->location ? element->location : "-"),
|
element->statuscode, st->md5,
|
||||||
(long int) ftell(fp), st->filename,
|
(element->location ? element->location : "-"), (long int) ftell(fp),
|
||||||
(long int) (size_headers + element->size));
|
st->filename, (long int) (size_headers + element->size));
|
||||||
/* network_doc */
|
/* network_doc */
|
||||||
if (fwrite(st->headers, 1, size_headers, fp) != size_headers
|
if (fwrite(st->headers, 1, size_headers, fp) != size_headers
|
||||||
|| (element->size > 0
|
|| (element->size > 0
|
||||||
|
|||||||
@@ -4,28 +4,33 @@
|
|||||||
# Initializes the htsserver GUI frontend and launch the default browser
|
# Initializes the htsserver GUI frontend and launch the default browser
|
||||||
|
|
||||||
BROWSEREXE=
|
BROWSEREXE=
|
||||||
SRCHBROWSEREXE="x-www-browser www-browser iceape mozilla firefox-developer-edition firefox icecat iceweasel abrowser firebird galeon konqueror midori opera google-chrome chrome chromium chromium-browser netscape firefox-developer-edition"
|
SRCHBROWSEREXE=(x-www-browser www-browser iceape mozilla firefox-developer-edition firefox icecat iceweasel abrowser firebird galeon konqueror midori opera google-chrome chrome chromium chromium-browser netscape firefox-developer-edition)
|
||||||
|
# shellcheck disable=SC2153 # BROWSER is the standard freedesktop env var, not a typo
|
||||||
if test -n "${BROWSER}"; then
|
if test -n "${BROWSER}"; then
|
||||||
# sensible-browser will f up if BROWSER is not set
|
# sensible-browser will f up if BROWSER is not set
|
||||||
SRCHBROWSEREXE="xdg-open sensible-browser ${SRCHBROWSEREXE}"
|
SRCHBROWSEREXE=(xdg-open sensible-browser "${SRCHBROWSEREXE[@]}")
|
||||||
fi
|
fi
|
||||||
# Patch for Darwin/Mac by Ross Williams
|
# Patch for Darwin/Mac by Ross Williams
|
||||||
if test "`uname -s`" == "Darwin"; then
|
if test "$(uname -s)" == "Darwin"; then
|
||||||
# Darwin/Mac OS X uses a system 'open' command to find
|
# Darwin/Mac OS X uses a system 'open' command to find
|
||||||
# the default browser. The -W flag causes it to wait for
|
# the default browser. The -W flag causes it to wait for
|
||||||
# the browser to exit
|
# the browser to exit
|
||||||
BROWSEREXE="/usr/bin/open -W"
|
BROWSEREXE="/usr/bin/open -W"
|
||||||
fi
|
fi
|
||||||
BINWD=`dirname "$0"`
|
BINWD=$(dirname "$0")
|
||||||
SRCHPATH="$BINWD /usr/local/bin /usr/share/bin /usr/bin /usr/lib/httrack /usr/local/lib/httrack /usr/local/share/httrack /opt/local/bin /sw/bin ${HOME}/usr/bin ${HOME}/bin"
|
SRCHPATH=("$BINWD" /usr/local/bin /usr/share/bin /usr/bin /usr/lib/httrack /usr/local/lib/httrack /usr/local/share/httrack /opt/local/bin /sw/bin "${HOME}/usr/bin" "${HOME}/bin")
|
||||||
SRCHPATH="$SRCHPATH "`echo $PATH | tr ":" " "`
|
IFS=':' read -ra pathdirs <<<"$PATH"
|
||||||
SRCHDISTPATH="$BINWD/../share $BINWD/.. /usr/share /usr/local /usr /local /usr/local/share ${HOME}/usr ${HOME}/usr/share /opt/local/share /sw ${HOME}/usr/local ${HOME}/usr/share"
|
for d in "${pathdirs[@]}"; do
|
||||||
|
# drop empty PATH fields, matching the old echo|tr word-split
|
||||||
|
test -n "$d" && SRCHPATH+=("$d")
|
||||||
|
done
|
||||||
|
SRCHDISTPATH=("$BINWD/../share" "$BINWD/.." /usr/share /usr/local /usr /local /usr/local/share "${HOME}/usr" "${HOME}/usr/share" /opt/local/share /sw "${HOME}/usr/local" "${HOME}/usr/share")
|
||||||
|
|
||||||
###
|
###
|
||||||
# And now some famous cuisine
|
# And now some famous cuisine
|
||||||
|
|
||||||
function log {
|
function log {
|
||||||
echo "$0($$): $@" >&2
|
echo "$0($$): $*" >&2
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,35 +47,35 @@ log "Browser (or helper) exited"
|
|||||||
|
|
||||||
# First ensure that we can launch the server
|
# First ensure that we can launch the server
|
||||||
BINPATH=
|
BINPATH=
|
||||||
for i in ${SRCHPATH}; do
|
for i in "${SRCHPATH[@]}"; do
|
||||||
! test -n "${BINPATH}" && test -x ${i}/htsserver && BINPATH=${i}
|
! test -n "${BINPATH}" && test -x "${i}/htsserver" && BINPATH="${i}"
|
||||||
done
|
done
|
||||||
for i in ${SRCHDISTPATH}; do
|
for i in "${SRCHDISTPATH[@]}"; do
|
||||||
! test -n "${DISTPATH}" && test -f "${i}/httrack/lang.def" && DISTPATH="${i}/httrack"
|
! test -n "${DISTPATH}" && test -f "${i}/httrack/lang.def" && DISTPATH="${i}/httrack"
|
||||||
done
|
done
|
||||||
test -n "${BINPATH}" || ! log "Could not find htsserver" || exit 1
|
test -n "${BINPATH}" || ! log "Could not find htsserver" || exit 1
|
||||||
test -n "${DISTPATH}" || ! log "Could not find httrack directory" || exit 1
|
test -n "${DISTPATH}" || ! log "Could not find httrack directory" || exit 1
|
||||||
test -f ${DISTPATH}/lang.def || ! log "Could not find ${DISTPATH}/lang.def" || exit 1
|
test -f "${DISTPATH}/lang.def" || ! log "Could not find ${DISTPATH}/lang.def" || exit 1
|
||||||
test -f ${DISTPATH}/lang.indexes || ! log "Could not find ${DISTPATH}/lang.indexes" || exit 1
|
test -f "${DISTPATH}/lang.indexes" || ! log "Could not find ${DISTPATH}/lang.indexes" || exit 1
|
||||||
test -d ${DISTPATH}/lang || ! log "Could not find ${DISTPATH}/lang" || exit 1
|
test -d "${DISTPATH}/lang" || ! log "Could not find ${DISTPATH}/lang" || exit 1
|
||||||
test -d ${DISTPATH}/html || ! log "Could not find ${DISTPATH}/html" || exit 1
|
test -d "${DISTPATH}/html" || ! log "Could not find ${DISTPATH}/html" || exit 1
|
||||||
|
|
||||||
# Locale
|
# Locale
|
||||||
HTSLANG="${LC_MESSAGES}"
|
HTSLANG="${LC_MESSAGES}"
|
||||||
! test -n "${HTSLANG}" && HTSLANG="${LC_ALL}"
|
! test -n "${HTSLANG}" && HTSLANG="${LC_ALL}"
|
||||||
! test -n "${HTSLANG}" && HTSLANG="${LANG}"
|
! test -n "${HTSLANG}" && HTSLANG="${LANG}"
|
||||||
HTSLANG="`echo $LANG | cut -f1 -d'.' | cut -f1 -d'_'`"
|
HTSLANG="$(echo "$LANG" | cut -f1 -d'.' | cut -f1 -d'_')"
|
||||||
LANGN=`grep -E "^${HTSLANG}:" ${DISTPATH}/lang.indexes | cut -f2 -d':'`
|
LANGN=$(grep -E "^${HTSLANG}:" "${DISTPATH}/lang.indexes" | cut -f2 -d':')
|
||||||
! test -n "${LANGN}" && LANGN=1
|
! test -n "${LANGN}" && LANGN=1
|
||||||
|
|
||||||
# Find the browser
|
# Find the browser
|
||||||
# note: not all systems have sensible-browser or www-browser alternative
|
# note: not all systems have sensible-browser or www-browser alternative
|
||||||
# thefeore, we have to find a bit more if sensible-browser could not be found
|
# thefeore, we have to find a bit more if sensible-browser could not be found
|
||||||
|
|
||||||
for i in ${SRCHBROWSEREXE}; do
|
for i in "${SRCHBROWSEREXE[@]}"; do
|
||||||
for j in ${SRCHPATH}; do
|
for j in "${SRCHPATH[@]}"; do
|
||||||
if test -x ${j}/${i}; then
|
if test -x "${j}/${i}"; then
|
||||||
BROWSEREXE=${j}/${i}
|
BROWSEREXE="${j}/${i}"
|
||||||
fi
|
fi
|
||||||
test -n "$BROWSEREXE" && break
|
test -n "$BROWSEREXE" && break
|
||||||
done
|
done
|
||||||
@@ -81,7 +86,7 @@ test -n "$BROWSEREXE" || ! log "Could not find any suitable browser" || exit 1
|
|||||||
# "browse" command
|
# "browse" command
|
||||||
if test "$1" = "browse"; then
|
if test "$1" = "browse"; then
|
||||||
if test -f "${HOME}/.httrack.ini"; then
|
if test -f "${HOME}/.httrack.ini"; then
|
||||||
INDEXF=`cat ${HOME}/.httrack.ini | tr '\r' '\n' | grep -E "^path=" | cut -f2- -d'='`
|
INDEXF=$(tr '\r' '\n' <"${HOME}/.httrack.ini" | grep -E "^path=" | cut -f2- -d'=')
|
||||||
if test -n "${INDEXF}" -a -d "${INDEXF}" -a -f "${INDEXF}/index.html"; then
|
if test -n "${INDEXF}" -a -d "${INDEXF}" -a -f "${INDEXF}/index.html"; then
|
||||||
INDEXF="${INDEXF}/index.html"
|
INDEXF="${INDEXF}/index.html"
|
||||||
else
|
else
|
||||||
@@ -96,39 +101,43 @@ exit $?
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Create a temporary filename
|
# Create a temporary filename
|
||||||
TMPSRVFILE="$(mktemp ${TMPDIR:-/tmp}/.webhttrack.XXXXXXXX)" || ! log "Could not create the temporary file ${TMPSRVFILE}" || exit 1
|
TMPSRVFILE="$(mktemp "${TMPDIR:-/tmp}/.webhttrack.XXXXXXXX")" || ! log "Could not create the temporary file ${TMPSRVFILE}" || exit 1
|
||||||
# Launch htsserver binary and setup the server
|
# Launch htsserver binary and setup the server
|
||||||
(${BINPATH}/htsserver "${DISTPATH}/" --ppid "$$" path "${HOME}/websites" lang "${LANGN}" $@; echo SRVURL=error) > ${TMPSRVFILE}&
|
(
|
||||||
|
"${BINPATH}/htsserver" "${DISTPATH}/" --ppid "$$" path "${HOME}/websites" lang "${LANGN}" "$@"
|
||||||
|
echo SRVURL=error
|
||||||
|
) >"${TMPSRVFILE}" &
|
||||||
# Find the generated SRVURL
|
# Find the generated SRVURL
|
||||||
SRVURL=
|
SRVURL=
|
||||||
MAXCOUNT=60
|
MAXCOUNT=60
|
||||||
while ! test -n "$SRVURL"; do
|
while ! test -n "$SRVURL"; do
|
||||||
MAXCOUNT=$[$MAXCOUNT - 1]
|
MAXCOUNT=$((MAXCOUNT - 1))
|
||||||
test $MAXCOUNT -gt 0 || exit 1
|
test $MAXCOUNT -gt 0 || exit 1
|
||||||
test $MAXCOUNT -lt 50 && echo "waiting for server to reply.."
|
test $MAXCOUNT -lt 50 && echo "waiting for server to reply.."
|
||||||
SRVURL=`grep -E URL= ${TMPSRVFILE} | cut -f2- -d=`
|
SRVURL=$(grep -E URL= "${TMPSRVFILE}" | cut -f2- -d=)
|
||||||
test ! "$SRVURL" = "error" || ! log "Could not spawn htsserver" || exit 1
|
test ! "$SRVURL" = "error" || ! log "Could not spawn htsserver" || exit 1
|
||||||
test -n "$SRVURL" || sleep 1
|
test -n "$SRVURL" || sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
# Cleanup function
|
# Cleanup function
|
||||||
|
# shellcheck disable=SC2120 # $1 is an optional "signal caught" marker; bare calls are intentional
|
||||||
function cleanup {
|
function cleanup {
|
||||||
test -n "$1" && log "Nasty signal caught, cleaning up.."
|
test -n "$1" && log "Nasty signal caught, cleaning up.."
|
||||||
# Do not kill if browser exited (chrome bug issue) ; server will die itself
|
# Do not kill if browser exited (chrome bug issue) ; server will die itself
|
||||||
test -n "$1" && test -f ${TMPSRVFILE} && SRVPID=`grep -E PID= ${TMPSRVFILE} | cut -f2- -d=`
|
test -n "$1" && test -f "${TMPSRVFILE}" && SRVPID=$(grep -E PID= "${TMPSRVFILE}" | cut -f2- -d=)
|
||||||
test -n "${SRVPID}" && kill -9 ${SRVPID}
|
test -n "${SRVPID}" && kill -9 "${SRVPID}"
|
||||||
test -f ${TMPSRVFILE} && rm ${TMPSRVFILE}
|
test -f "${TMPSRVFILE}" && rm "${TMPSRVFILE}"
|
||||||
test -n "$1" && log "..Done"
|
test -n "$1" && log "..Done"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
# Cleanup in case of emergency
|
# Cleanup in case of emergency
|
||||||
trap "cleanup now; exit" 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap "cleanup now; exit" HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
|
|
||||||
# Got SRVURL, launch browser
|
# Got SRVURL, launch browser
|
||||||
launch_browser "${BROWSEREXE}" "${SRVURL}"
|
launch_browser "${BROWSEREXE}" "${SRVURL}"
|
||||||
|
|
||||||
# That's all, folks!
|
# That's all, folks!
|
||||||
trap "" 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap "" HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
cleanup
|
cleanup
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
15
tests/01_engine-cookies.test
Executable file
15
tests/01_engine-cookies.test
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||||
|
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
||||||
|
out=$(httrack -#Q run)
|
||||||
|
|
||||||
|
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||||
|
test "$out" = "cookie-header: OK" || {
|
||||||
|
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
17
tests/01_engine-copyopt.test
Executable file
17
tests/01_engine-copyopt.test
Executable file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||||
|
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||||
|
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
||||||
|
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
||||||
|
out=$(httrack -#9 run)
|
||||||
|
|
||||||
|
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||||
|
test "$out" = "copy-htsopt: OK" || {
|
||||||
|
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
15
tests/01_engine-dns.test
Normal file
15
tests/01_engine-dns.test
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# DNS resolver/cache self-test: a mock getaddrinfo (no network) checks address
|
||||||
|
# family, single-address selection, the -@i4/-@i6 family filter, and cache reuse.
|
||||||
|
# The trailing token is required, like the other -# selftests, so a bare command
|
||||||
|
# line isn't treated as "no arguments" and routed to the usage screen.
|
||||||
|
out=$(httrack -#D run)
|
||||||
|
|
||||||
|
test "$out" = "dns-selftest: OK" || {
|
||||||
|
echo "expected 'dns-selftest: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
@@ -89,4 +89,37 @@ grep -q NEWCONTENT "$(find "$out" -path '*/a.html' -print -quit)" || {
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- 3. an empty quoted arg survives the doit.log round-trip (#106) ----------
|
||||||
|
# -%F "" (empty footer) records an empty "" token in doit.log; -r2 follows it so
|
||||||
|
# a "drop the empty token" bug shifts -r2 into -%F's slot (the reprise then sees
|
||||||
|
# -%F -r2 and panics "%F needs to be followed by ..."), making the bug visible
|
||||||
|
# rather than a harmless run off the end of argv.
|
||||||
|
out2="$tmp/out2"
|
||||||
|
rc=0
|
||||||
|
"$bin" "$url" -O "$out2" --quiet -n -%v0 -%F "" -r2 >/dev/null 2>&1 || rc=$?
|
||||||
|
test "$rc" -eq 0 || {
|
||||||
|
echo "FAIL: initial mirror with empty footer exited $rc"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
# precondition: the writer put the empty token on disk for the reader to reload.
|
||||||
|
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||||
|
echo "FAIL: empty footer not recorded as -%F \"\" -r2 in doit.log"
|
||||||
|
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
# no-url reprise: the reader rebuilds argv from doit.log and rewrites doit.log
|
||||||
|
# from it. The empty token surviving in the regenerated file proves the reader
|
||||||
|
# kept it (a drop/swallow would panic above or rewrite -%F without the "").
|
||||||
|
rc=0
|
||||||
|
"$bin" -O "$out2" --quiet >/dev/null 2>&1 || rc=$?
|
||||||
|
test "$rc" -eq 0 || {
|
||||||
|
echo "FAIL: empty-footer reprise exited $rc (empty token dropped from doit.log?)"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||||
|
echo "FAIL: empty footer did not survive the doit.log reload round-trip"
|
||||||
|
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
@@ -154,4 +154,173 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
|
|||||||
grep -q 'title="file://' "$saved2" ||
|
grep -q 'title="file://' "$saved2" ||
|
||||||
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
||||||
|
|
||||||
|
# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
|
||||||
|
# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
|
||||||
|
site3="$tmp/xmlns"
|
||||||
|
mkdir -p "$site3"
|
||||||
|
for f in ns og rdfs real; do gif "$site3/$f.gif"; done
|
||||||
|
cat >"$site3/index.html" <<EOF
|
||||||
|
<html xmlns="file://$site3/ns.gif"><body>
|
||||||
|
<svg xmlns:og="file://$site3/og.gif"></svg>
|
||||||
|
<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
|
||||||
|
<a href="file://$site3/real.gif">real link</a>
|
||||||
|
</body></html>
|
||||||
|
EOF
|
||||||
|
out3="$tmp/xmlns-out"
|
||||||
|
crawl "$site3/index.html" "$out3"
|
||||||
|
|
||||||
|
# the real link is still captured
|
||||||
|
found "real.gif" "$out3"
|
||||||
|
# namespace-declaration targets must not be fetched (default + prefixed forms)
|
||||||
|
notfound "ns.gif" "$out3"
|
||||||
|
notfound "og.gif" "$out3"
|
||||||
|
notfound "rdfs.gif" "$out3"
|
||||||
|
|
||||||
|
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||||
|
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||||
|
# a space before ';'); they are the negative controls: without the parser fix the
|
||||||
|
# URL is dropped, so a regression fails these found() checks.
|
||||||
|
site4="$tmp/cssimport"
|
||||||
|
mkdir -p "$site4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||||
|
cat >"$site4/main.css" <<'EOF'
|
||||||
|
@import url(nq.css);
|
||||||
|
@import url("dqu.css");
|
||||||
|
@import url('squ.css');
|
||||||
|
@import "dqs.css";
|
||||||
|
@import 'sqs.css';
|
||||||
|
@import url(med.css) screen and (min-width: 400px);
|
||||||
|
@import "cond.css" screen;
|
||||||
|
@import "sup.css" supports(display: flex);
|
||||||
|
@import url(lay.css) layer(base);
|
||||||
|
@import "spc.css" ;
|
||||||
|
EOF
|
||||||
|
out4="$tmp/cssimport-out"
|
||||||
|
crawl "$site4/main.css" "$out4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||||
|
|
||||||
|
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||||
|
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||||
|
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||||
|
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||||
|
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||||
|
grep -Fq "$cond" "$m4" ||
|
||||||
|
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||||
|
done
|
||||||
|
|
||||||
|
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||||
|
# capture a bogus link; a valid sibling import is still captured. Guards a heap
|
||||||
|
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||||
|
site5="$tmp/cssimport-trunc"
|
||||||
|
mkdir -p "$site5"
|
||||||
|
printf 'body{}\n' >"$site5/good.css"
|
||||||
|
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||||
|
out5="$tmp/cssimport-trunc-out"
|
||||||
|
crawl "$site5/main.css" "$out5"
|
||||||
|
found "good.css" "$out5"
|
||||||
|
notfound "trunc" "$out5"
|
||||||
|
|
||||||
|
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||||
|
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
|
||||||
|
# url() target is still captured; here it just must not underflow.
|
||||||
|
site6="$tmp/parse-off0"
|
||||||
|
mkdir -p "$site6"
|
||||||
|
printf 'body{}\n' >"$site6/off0.css"
|
||||||
|
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||||
|
out6="$tmp/parse-off0-out"
|
||||||
|
crawl "$site6/main.css" "$out6"
|
||||||
|
found "off0.css" "$out6"
|
||||||
|
|
||||||
|
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
|
||||||
|
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
|
||||||
|
# fixture saves a bare file named GET; a live server mangles it to GET.html).
|
||||||
|
# window.open(url) detection must be unaffected.
|
||||||
|
site7="$tmp/xhropen"
|
||||||
|
mkdir -p "$site7"
|
||||||
|
gif "$site7/winopen.gif"
|
||||||
|
cat >"$site7/index.html" <<EOF
|
||||||
|
<html><body><script>
|
||||||
|
var x = new XMLHttpRequest();
|
||||||
|
x.open("GET", "ajax_info.txt");
|
||||||
|
var y = new XMLHttpRequest();
|
||||||
|
y.open("Post", "submit.cgi");
|
||||||
|
window.open("file://$site7/winopen.gif");
|
||||||
|
</script></body></html>
|
||||||
|
EOF
|
||||||
|
out7="$tmp/xhropen-out"
|
||||||
|
crawl "$site7/index.html" "$out7"
|
||||||
|
# negative control: without the fix a file named exactly GET is downloaded
|
||||||
|
notfound "GET" "$out7"
|
||||||
|
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
|
||||||
|
# method is rejected too, so a file named Post must not appear either
|
||||||
|
notfound "Post" "$out7"
|
||||||
|
# regression guard: window.open(url) is still detected, so its absolute URL is
|
||||||
|
# rewritten to a local link. The rewrite only happens if the parser saw it, so
|
||||||
|
# these two assertions fail if .open detection broke (not a trivial --near save).
|
||||||
|
saved7=$(savedhtml "$out7")
|
||||||
|
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
|
||||||
|
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
|
||||||
|
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
|
||||||
|
! grep -Fq 'window.open("file://' "$saved7" ||
|
||||||
|
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
|
||||||
|
|
||||||
|
# Parens in an unquoted url(...) (#163): the source %28/%29 decode to literal
|
||||||
|
# '(' ')' in the saved name, but a literal ')' in the rewritten url() closes the
|
||||||
|
# token early, so they must stay encoded. Negative control: without the fix the
|
||||||
|
# %281%29 greps fail (parens are RFC2396 "mark" chars the escaper leaves alone).
|
||||||
|
site8="$tmp/cssparens"
|
||||||
|
mkdir -p "$site8"
|
||||||
|
for f in 'img (1).gif' 'a(b)c(1).gif' 'q (4).gif'; do gif "$site8/$f"; done
|
||||||
|
cat >"$site8/style.css" <<'EOF'
|
||||||
|
.a { background: url(img%20%281%29.gif); }
|
||||||
|
.b { background: url(a%28b%29c%281%29.gif); }
|
||||||
|
.c { background: url("q%20%284%29.gif"); }
|
||||||
|
EOF
|
||||||
|
out8="$tmp/cssparens-out"
|
||||||
|
crawl "$site8/style.css" "$out8"
|
||||||
|
found "img (1).gif" "$out8"
|
||||||
|
found "a(b)c(1).gif" "$out8"
|
||||||
|
found "q (4).gif" "$out8"
|
||||||
|
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
|
||||||
|
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
|
||||||
|
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
|
||||||
|
! echo "FAIL #163: parens in unquoted url() not percent-encoded on rewrite" || exit 1
|
||||||
|
grep -Fq 'url(a%28b%29c%281%29.gif)' "$css8" ||
|
||||||
|
! echo "FAIL #163: not every paren in a url() was percent-encoded" || exit 1
|
||||||
|
grep -Fq 'url("q%20%284%29.gif")' "$css8" ||
|
||||||
|
! echo "FAIL #163: quoted url() altered or parens left literal on rewrite" || exit 1
|
||||||
|
|
||||||
|
# The url() detector is not CSS-specific: <script> and inline style= get the
|
||||||
|
# same encoding, but ordinary href/src (ending_p is the quote, not ')') keep
|
||||||
|
# literal parens -- the attribute checks guard the gate against over-firing.
|
||||||
|
site9="$tmp/urlparens"
|
||||||
|
mkdir -p "$site9"
|
||||||
|
for f in 'js (1).gif' 'inl (2).gif' 'asrc (3).gif' 'ahref (4).gif'; do gif "$site9/$f"; done
|
||||||
|
cat >"$site9/index.html" <<EOF
|
||||||
|
<html><body>
|
||||||
|
<script>var bg = "url(js%20%281%29.gif)";</script>
|
||||||
|
<div style="background-image:url(inl%20%282%29.gif)"></div>
|
||||||
|
<img src="asrc%20%283%29.gif">
|
||||||
|
<a href="ahref%20%284%29.gif">link</a>
|
||||||
|
</body></html>
|
||||||
|
EOF
|
||||||
|
out9="$tmp/urlparens-out"
|
||||||
|
crawl "$site9/index.html" "$out9"
|
||||||
|
saved9=$(savedhtml "$out9")
|
||||||
|
test -n "$saved9" || ! echo "FAIL: saved urlparens page not found" || exit 1
|
||||||
|
# rewrite-only: the JS-string asset is not queued for download
|
||||||
|
grep -Fq 'url(js%20%281%29.gif)' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in <script> url() not percent-encoded" || exit 1
|
||||||
|
found "inl (2).gif" "$out9"
|
||||||
|
grep -Fq 'url(inl%20%282%29.gif)' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in inline style url() not percent-encoded" || exit 1
|
||||||
|
found "asrc (3).gif" "$out9"
|
||||||
|
found "ahref (4).gif" "$out9"
|
||||||
|
grep -Fq 'src="asrc%20(3).gif"' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in a plain src attribute were wrongly encoded" || exit 1
|
||||||
|
grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in a plain href attribute were wrongly encoded" || exit 1
|
||||||
|
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||||
|
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
68
tests/01_engine-relative.test
Executable file
68
tests/01_engine-relative.test
Executable file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# lienrelatif (build relative path) + ident_url_relatif (resolve a link, collapse
|
||||||
|
# ./ and ../). Regression net for #137/#162; expected values hand-computed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# relative path from <curr>'s directory to <link>
#   $1  link target
#   $2  current page (the path is computed from its directory)
#   $3  expected relative path, without the "relative=" prefix
# Prints a FAIL line and exits 1 when httrack's answer differs.
rel() {
    local actual
    actual=$(httrack -O /dev/null -#l "$1" "$2")
    if test "$actual" != "relative=$3"; then
        echo "FAIL rel($1, $2): got '$actual' want 'relative=$3'"
        exit 1
    fi
}
|
||||||
|
|
||||||
|
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
#   $1  link to resolve
#   $2  origin address (adr)
#   $3  origin file (fil)
#   $4  expected output line, compared verbatim
# Prints a FAIL line and exits 1 when httrack's answer differs.
ident() {
    local resolved
    resolved=$(httrack -O /dev/null -#i "$1" "$2" "$3")
    if test "$resolved" != "$4"; then
        echo "FAIL ident($1, $2, $3): got '$resolved' want '$4'"
        exit 1
    fi
}
|
||||||
|
|
||||||
|
### lienrelatif
|
||||||
|
|
||||||
|
rel 'dir/page.html' 'dir/index.html' 'page.html'
|
||||||
|
rel 'dir/page.html' 'dir/page.html' 'page.html' # self-link
|
||||||
|
rel 'a.html' 'dir/index.html' '../a.html'
|
||||||
|
rel 'x.html' 'a/b/c/index.html' '../../../x.html'
|
||||||
|
rel 'h/a/x.jpg' 'h/a/sub/page.html' '../x.jpg'
|
||||||
|
rel 'a/b/c/x.html' 'index.html' 'a/b/c/x.html'
|
||||||
|
rel 'h/sub/x.jpg' 'h/page.html' 'sub/x.jpg'
|
||||||
|
rel 'h/dir2/x.jpg' 'h/dir1/page.html' '../dir2/x.jpg' # sibling dir
|
||||||
|
rel 'h/bc/x.jpg' 'h/b/page.html' '../bc/x.jpg' # b/bc prefix trap
|
||||||
|
rel 'h/b/x.jpg' 'h/bc/page.html' '../b/x.jpg'
|
||||||
|
rel 'h2/img/x.jpg' 'h1/p/page.html' '../../h2/img/x.jpg' # cross-host
|
||||||
|
rel 'img.cdn/photo.jpg' 'www.site/articles/2020/post.html' '../../../img.cdn/photo.jpg'
|
||||||
|
rel 'h/a/' 'h/a/sub/page.html' '../' # link is ancestor dir
|
||||||
|
rel 'x.html' 'page.html' 'x.html'
|
||||||
|
rel 'dir/page.html?x=1' 'dir/index.html?y=2' 'page.html' # ? stripped
|
||||||
|
|
||||||
|
### ident_url_relatif
|
||||||
|
|
||||||
|
ident 'img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/dir/img.gif'
|
||||||
|
ident 'sub/img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/dir/sub/img.gif'
|
||||||
|
ident '/img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/img.gif'
|
||||||
|
# embedded ../ collapses (#137)
|
||||||
|
ident '../img.gif' 'www.foo.com' '/dir/sub/page.html' 'adr=www.foo.com fil=/dir/img.gif'
|
||||||
|
ident 'sub/../logo.png' 'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/articles/2020/logo.png'
|
||||||
|
ident '../../pix/sub/../logo.png' 'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/pix/logo.png'
|
||||||
|
ident '../../../../x.gif' 'www.foo.com' '/a/b/page.html' 'adr=www.foo.com fil=/x.gif' # above-root clamp
|
||||||
|
ident '?page=2' 'www.foo.com' '/dir/index.html?old=1' 'adr=www.foo.com fil=/dir/index.html?page=2'
|
||||||
|
ident 'http://other.com/a/b/../c/index.html' 'www.foo.com' '/p.html' 'adr=other.com fil=/a/c/index.html'
|
||||||
|
# file:// collapses ../ like the other schemes; traversal contained, // authority kept
|
||||||
|
ident 'file:///var/data/pix/sub/../logo.png' 'www.foo.com' '/p.html' 'adr=file:// fil=/var/data/pix/logo.png'
|
||||||
|
ident 'file:///a/b/c/../../d/e.gif' 'www.foo.com' '/p.html' 'adr=file:// fil=/a/d/e.gif'
|
||||||
|
ident 'file:///a/../../b' 'www.foo.com' '/p.html' 'adr=file:// fil=/b'
|
||||||
|
ident 'file://srv/share/../x' 'www.foo.com' '/p.html' 'adr=file:// fil=//srv/x'
|
||||||
|
ident 'mailto:foo@bar.com' 'www.foo.com' '/p.html' 'error=-1' # unsupported scheme
|
||||||
|
ident 'javascript:void(0)' 'www.foo.com' '/p.html' 'error=-1'
|
||||||
|
|
||||||
|
echo "OK"
|
||||||
41
tests/01_engine-savename.test
Executable file
41
tests/01_engine-savename.test
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Local save-name extension resolution (url_savename via -#N <fil> <content-type>).
|
||||||
|
# Asserts on the basename of "savename: <path>".
|
||||||
|
|
||||||
|
# name <fil> <content-type> <expected-basename>
#   $1  URL file part handed to the -#N self-test
#   $2  Content-Type handed to the -#N self-test
#   $3  expected basename of the reported save path
# Prints a FAIL line and exits 1 on a mismatch.
name() {
    # Keep the scratch variable out of the global scope. It is declared on its
    # own line because 'local out=$(...)' would report local's exit status and
    # hide a failing pipeline from 'set -e'.
    local out
    out="$(httrack -O /dev/null -#N "$1" "$2" | sed -n 's/^savename: //p')"
    # ${out##*/} drops everything up to the last '/', leaving the basename.
    test "${out##*/}" == "$3" || {
        echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
        exit 1
    }
}
|
||||||
|
|
||||||
|
# #115: an unknown trailing ".token" is part of the name, keep it and append the type.
|
||||||
|
name '/article-1.884291' 'text/html' 'article-1.884291.html'
|
||||||
|
name '/news/story-12345.987654' 'text/html' 'story-12345.987654.html'
|
||||||
|
|
||||||
|
# Recognized extensions still collapse to the resolved type.
|
||||||
|
name '/page.php' 'text/html' 'page.html'
|
||||||
|
name '/page.asp' 'text/html' 'page.html'
|
||||||
|
name '/foo' 'text/html' 'foo.html'
|
||||||
|
|
||||||
|
# A bare trailing dot is not a tail to keep.
|
||||||
|
name '/page.' 'text/html' 'page.html'
|
||||||
|
|
||||||
|
# Soft-404 (#267/#408): a binary URL served as HTML is named .html.
|
||||||
|
name '/x.pdf' 'text/html' 'x.html'
|
||||||
|
name '/x.gif' 'text/html' 'x.html'
|
||||||
|
|
||||||
|
# Type agrees with the extension: keep it, no churn, no double extension.
|
||||||
|
name '/x.pdf' 'application/pdf' 'x.pdf'
|
||||||
|
name '/x.jpg' 'image/jpeg' 'x.jpg'
|
||||||
|
name '/x.html' 'text/html' 'x.html'
|
||||||
|
name '/x.js' 'application/x-javascript' 'x.js'
|
||||||
|
name '/types/data.json' 'application/json' 'data.json'
|
||||||
|
|
||||||
|
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
|
||||||
|
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||||
@@ -26,3 +26,17 @@ simp './a/../../b' 'b'
|
|||||||
|
|
||||||
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
|
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
|
||||||
simp 'a//b' 'a//b'
|
simp 'a//b' 'a//b'
|
||||||
|
simp 'a//b/../c' 'a//c'
|
||||||
|
|
||||||
|
# absolute paths keep the leading '/'; above-root '..' is clamped to it
|
||||||
|
simp '/a/../b' '/b'
|
||||||
|
simp '/a/../../b' '/b'
|
||||||
|
simp '/../x' '/x'
|
||||||
|
|
||||||
|
# collapses to nothing -> './' (relative) or '/' (absolute)
|
||||||
|
simp '..' './'
|
||||||
|
simp 'a/..' './'
|
||||||
|
simp '/' '/'
|
||||||
|
|
||||||
|
simp 'a/b/..' 'a/' # trailing bare '..'
|
||||||
|
simp 'a/../b?x=../y' 'b?x=../y' # '?' freezes simplification
|
||||||
|
|||||||
@@ -21,9 +21,15 @@ test "$out" == "strsafe: OK" || exit 1
|
|||||||
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
||||||
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*) echo "over-capacity write was NOT caught" >&2; exit 1 ;;
|
*"strsafe: NOT aborted"*)
|
||||||
|
echo "over-capacity write was NOT caught" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
*"overflow while copying"*) ;;
|
*"overflow while copying"*) ;;
|
||||||
*) echo "expected htssafe overflow abort, got: $err" >&2; exit 1 ;;
|
*)
|
||||||
|
echo "expected htssafe overflow abort, got: $err" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Same guarantee for the htsbuff builder. The source is exactly the buffer
|
# Same guarantee for the htsbuff builder. The source is exactly the buffer
|
||||||
@@ -32,7 +38,13 @@ esac
|
|||||||
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
||||||
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*) echo "htsbuff over-capacity write was NOT caught" >&2; exit 1 ;;
|
*"strsafe: NOT aborted"*)
|
||||||
|
echo "htsbuff over-capacity write was NOT caught" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
*"htsbuff append overflow"*) ;;
|
*"htsbuff append overflow"*) ;;
|
||||||
*) echo "expected htsbuff overflow abort, got: $err" >&2; exit 1 ;;
|
*)
|
||||||
|
echo "expected htsbuff overflow abort, got: $err" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
136
tests/13_crawl_proxy_https.test
Normal file
136
tests/13_crawl_proxy_https.test
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Issue #85: an https crawl must go through the configured proxy (CONNECT
|
||||||
|
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
|
||||||
|
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
||||||
|
echo "no https support compiled, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
|
||||||
|
echo "python3/openssl missing, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
|
||||||
|
server="$top_srcdir/tests/proxy-https-server.py"
|
||||||
|
tmpdir=$(mktemp -d)
|
||||||
|
pids=
|
||||||
|
|
||||||
|
# EXIT handler: signal every background process recorded in $pids, then remove
# the scratch directory.
cleanup() {
    # $pids is deliberately unquoted: it is a space-separated pid list.
    for pid in $pids; do
        if ! kill "$pid" 2>/dev/null; then
            : # kill failed (process probably already gone); nothing to do
        fi
    done
    rm -rf "$tmpdir"
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
# self-signed cert for the local TLS origin (httrack does not verify certs)
|
||||||
|
openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
|
||||||
|
-out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
|
||||||
|
>/dev/null 2>&1
|
||||||
|
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
|
||||||
|
|
||||||
|
# start_server <logdir> <mode>: launches a proxy+origin pair, sets $origin_port
# and $proxy_port from its announced ephemeral ports.
#   $1  log directory (created if missing); the server's stdout is captured in
#       <logdir>/ports.txt and its stderr in <logdir>/server.err
#   $2  mode string passed through to the server script
# The server pid is appended to $pids so the EXIT cleanup can kill it. If no
# "ready" line shows up, the server's stderr is dumped and the test exits 1.
start_server() {
    local dir="$1" mode="$2" ports srv
    mkdir -p "$dir"
    ports="$dir/ports.txt"
    python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
        >"$ports" 2>"$dir/server.err" &
    srv=$!
    pids="$pids $srv"
    # Poll for the "ready" line: at most 100 tries, 0.1s apart.
    for _ in $(seq 1 100); do
        grep -q "^ready" "$ports" 2>/dev/null && break
        # A server that has already exited will never announce its ports, so
        # stop polling and let the check below report the failure right away
        # instead of sleeping through the remaining tries.
        kill -0 "$srv" 2>/dev/null || break
        sleep 0.1
    done
    # Re-check after the loop: covers both a timeout and an early server exit.
    grep -q "^ready" "$ports" 2>/dev/null || {
        echo "server ($mode) did not start" >&2
        cat "$dir/server.err" >&2
        exit 1
    }
    origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
    proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
}
|
||||||
|
|
||||||
|
# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
|
||||||
|
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
|
||||||
|
# whole job. A portable stand-in for `timeout`, which macOS lacks.
|
||||||
|
HANG_RC=137 # 128 + SIGKILL
|
||||||
|
# run_crawl <outdir> <proxy> <origin-port>
# Crawls https://127.0.0.1:<origin-port>/ through <proxy> into <outdir>, with
# httrack's output captured in <outdir>.log. Returns httrack's exit status; a
# run killed by the watchdog is reported by wait as 128 + SIGKILL.
run_crawl() {
    local out="$1" proxy="$2" port="$3"
    local pid guard rc=0

    rm -rf "$out"
    httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
        -O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
    pid=$!

    # Watchdog: SIGKILL the crawl if it is still running after 60 seconds.
    (sleep 60 && kill -9 "$pid" 2>/dev/null) &
    guard=$!

    # Record a non-zero status instead of letting 'set -e' abort the script.
    wait "$pid" 2>/dev/null || rc=$?

    # The crawl is over: dismiss the watchdog and reap it.
    kill "$guard" 2>/dev/null || :
    wait "$guard" 2>/dev/null || :
    return "$rc"
}
|
||||||
|
|
||||||
|
# --- working proxy ----------------------------------------------------------
|
||||||
|
ok="$tmpdir/ok"
|
||||||
|
start_server "$ok" ok
|
||||||
|
|
||||||
|
# 1. page retrieved AND the proxy saw a CONNECT to the origin
|
||||||
|
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
|
||||||
|
echo "FAIL: origin page not downloaded through proxy" >&2
|
||||||
|
cat "$ok/out.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
|
||||||
|
echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
|
||||||
|
cat "$ok/proxy.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK: https tunneled through proxy via CONNECT"
|
||||||
|
|
||||||
|
# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
|
||||||
|
: >"$ok/proxy.log"
|
||||||
|
: >"$ok/origin-headers.log"
|
||||||
|
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
|
||||||
|
echo "FAIL: origin page not downloaded through authenticated proxy" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
|
||||||
|
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
|
||||||
|
# which differs between GNU and BSD)
|
||||||
|
test "$got" == "dXNlcjpzZWNyZXQ=" || {
|
||||||
|
echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
|
||||||
|
cat "$ok/proxy.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
|
||||||
|
echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
|
||||||
|
cat "$ok/origin-headers.log" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
|
||||||
|
|
||||||
|
# --- hostile proxy ----------------------------------------------------------
|
||||||
|
# A proxy that answers 200 then streams headers forever must not hang the crawl:
|
||||||
|
# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
|
||||||
|
# missing bound surfaces as $HANG_RC here.
|
||||||
|
flood="$tmpdir/flood"
|
||||||
|
start_server "$flood" flood
|
||||||
|
rc=0
|
||||||
|
run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
|
||||||
|
test "$rc" -ne "$HANG_RC" || {
|
||||||
|
echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null && {
|
||||||
|
echo "FAIL: flooding proxy unexpectedly served the page" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK: bounded proxy response, no hang on a flooding proxy"
|
||||||
15
tests/13_local-cookies.test
Executable file
15
tests/13_local-cookies.test
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Cookie chain against the local test server (replaces the old online
|
||||||
|
# ut/cookies/*.php fixtures). entrance.php sets cat/cake; second.php checks
|
||||||
|
# them and sets badger; third.php checks all three. A missing or wrong cookie
|
||||||
|
# returns 500, which would surface as an httrack error and a missing file, so a
|
||||||
|
# clean 3-files/0-errors run proves the cookie jar is replayed across links.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --files 3 \
|
||||||
|
--found 'cookies/entrance.html' \
|
||||||
|
--found 'cookies/second.html' \
|
||||||
|
--found 'cookies/third.html' \
|
||||||
|
httrack 'BASEURL/cookies/entrance.php'
|
||||||
18
tests/14_local-https.test
Executable file
18
tests/14_local-https.test
Executable file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# HTTPS crawl against the local test server, using the shipped self-signed
|
||||||
|
# cert. httrack does not verify certs (htslib.c: SSL_CTX_new with no
|
||||||
|
# SSL_CTX_set_verify), so the self-signed cert is accepted as-is and this
|
||||||
|
# exercises the real TLS path offline. basic.html links to link.html with four
|
||||||
|
# distinct query strings, each saved under a hashed name -> 5 files.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
if test "$HTTPS_SUPPORT" == "no"; then
|
||||||
|
echo "no https support compiled, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --tls --errors 0 --files 5 \
|
||||||
|
--found 'simple/basic.html' \
|
||||||
|
httrack 'BASEURL/simple/basic.html'
|
||||||
25
tests/15_local-types.test
Normal file
25
tests/15_local-types.test
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||||
|
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||||
|
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||||
|
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||||
|
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||||
|
# asserted absent so a regression in either direction fails here.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||||
|
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||||
|
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||||
|
--found 'types/photo.png' \
|
||||||
|
--found 'types/doc.pdf' \
|
||||||
|
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||||
|
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||||
|
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||||
|
--found 'types/script.js' \
|
||||||
|
--found 'types/style.css' \
|
||||||
|
--found 'types/data.json' \
|
||||||
|
--found 'types/control.html' --not-found 'types/control.php' \
|
||||||
|
--found 'types/gend61c.png' --not-found 'types/gend61c.html' \
|
||||||
|
httrack 'BASEURL/types/index.html'
|
||||||
11
tests/16_local-assume.test
Normal file
11
tests/16_local-assume.test
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# --assume under the default delayed type check (-%N2), issue #56. A user type
|
||||||
|
# pinned with --assume must be honored immediately, not lost to the delayed
|
||||||
|
# name: photo.png served as image/png but assumed text/html is saved as .html.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||||
|
--found 'types/photo.html' --not-found 'types/photo.png' \
|
||||||
|
httrack 'BASEURL/types/photo.png' --assume png=text/html
|
||||||
12
tests/17_local-empty-ct.test
Normal file
12
tests/17_local-empty-ct.test
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# An empty "Content-Type:" header value must be treated as "no usable type"
|
||||||
|
# (keep the URL extension), not parsed from an uninitialized buffer. The crawl
|
||||||
|
# also runs under ASan/UBSan in CI, which catches the uninitialized read this
|
||||||
|
# guards against.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||||
|
--found 'types/emptyct.png' --not-found 'types/emptyct.html' \
|
||||||
|
httrack 'BASEURL/types/index.html'
|
||||||
15
tests/18_local-update.test
Normal file
15
tests/18_local-update.test
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||||
|
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||||
|
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||||
|
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||||
|
# typeless .png stays .png across the update rather than reverting.
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||||
|
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||||
|
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||||
|
--found 'types/lie.html' \
|
||||||
|
httrack 'BASEURL/types/index.html'
|
||||||
110
tests/19_local-connect-fallback.test
Normal file
110
tests/19_local-connect-fallback.test
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/bash
#
# Connect-fallback regression test.
#
# A host that resolves to several addresses must fall back to the next address
# when a connect fails, instead of giving up on the first one (dead IPv6 on a
# dual-stack host, ...). Two cases are driven through HTTRACK_DEBUG_RESOLVE:
#
#   1. "deadhost" is pinned to 127.0.0.2 (refused, nothing listening) and then
#      to 127.0.0.1 (the live server): the crawl only succeeds if httrack
#      retries the second address.
#   2. "alldead" is pinned to refused addresses only: the slot must exhaust the
#      list and error out (rather than hang or loop).

set -euo pipefail

top_srcdir=${top_srcdir:-..}

# Prerequisites; a missing one skips the test (exit status 77).
if [[ "${V6_SUPPORT:-}" == "no" ]]; then
  echo "no IPv6 support (resolver list/override is IPv6-only), skipping"
  exit 77
fi
command -v python3 >/dev/null 2>&1 || {
  echo "python3 missing, skipping"
  exit 77
}

server="$top_srcdir/tests/local-server.py"
root="$top_srcdir/tests/server-root"
tmpdir=$(mktemp -d)
serverpid=

# EXIT handler: stop and reap the server if it was started, then remove the
# scratch directory. Always returns 0.
cleanup() {
  [[ -z "$serverpid" ]] || {
    kill "$serverpid" 2>/dev/null || true
    wait "$serverpid" 2>/dev/null || true
  }
  rm -rf "$tmpdir"
  return 0
}
trap cleanup EXIT

# Bind the live server to 127.0.0.1 only, so 127.0.0.2 refuses the connect.
srv_out="$tmpdir/srv.out"
srv_err="$tmpdir/srv.err"
python3 "$server" --root "$root" --bind 127.0.0.1 >"$srv_out" 2>"$srv_err" &
serverpid=$!

# Poll the server's stdout for its "PORT <n>" line (50 tries, 0.1s apart),
# bailing out early if the server process has already died.
port=
for _ in $(seq 1 50); do
  first=$(head -n1 "$srv_out" 2>/dev/null || true)
  if [[ "${first%% *}" == "PORT" ]]; then
    port="${first#PORT }"
    break
  fi
  if ! kill -0 "$serverpid" 2>/dev/null; then
    echo "server exited early: $(cat "$srv_err")"
    exit 1
  fi
  sleep 0.1
done
if [[ -z "$port" ]]; then
  echo "could not discover server port"
  exit 1
fi

# --- case 1: first address refused, second address live ----------------------
out="$tmpdir/crawl"
HTTRACK_DEBUG_RESOLVE="deadhost:127.0.0.2,127.0.0.1" \
  httrack "http://deadhost:$port/simple/basic.html" -O "$out" \
  -c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log" 2>&1

log="$out/hts-log.txt"

# The dead address was tried, then the next one (proves the fallback ran).
grep -q "trying next address" "$log" || {
  echo "FAIL: no connect fallback happened"
  cat "$log"
  exit 1
}

# 0 errors and the file was actually fetched (over the live address).
errs=$(grep -iEc "^[0-9:]*[[:space:]]Error:" "$log" || true)
if [[ "$errs" != "0" ]]; then
  echo "FAIL: $errs error(s) reported"
  grep -iE "Error:" "$log"
  exit 1
fi
if [[ ! -f "$out/deadhost_$port/simple/basic.html" ]]; then
  echo "FAIL: basic.html not downloaded via fallback"
  find "$out" -type f
  exit 1
fi

# --- case 2: every address refused -------------------------------------------
# The slot exhausts the list, then errors out (the harness timeout would catch
# a hang/loop; refused connects are instant).
out2="$tmpdir/crawl2"
HTTRACK_DEBUG_RESOLVE="alldead:127.0.0.2,127.0.0.3" \
  httrack "http://alldead:$port/simple/basic.html" -O "$out2" \
  -c1 --robots=0 --timeout=30 --quiet -Z >"$tmpdir/log2" 2>&1
log2="$out2/hts-log.txt"

if ! grep -q "trying next address" "$log2"; then
  echo "FAIL: exhaustion path never tried the fallback address"
  cat "$log2"
  exit 1
fi
if ! grep -iqE "^[0-9:]*[[:space:]]Error:" "$log2"; then
  echo "FAIL: all addresses failing did not report an error"
  cat "$log2"
  exit 1
fi
if [[ -f "$out2/alldead_$port/simple/basic.html" ]]; then
  echo "FAIL: file downloaded despite every address failing"
  exit 1
fi

echo "OK: connect fallback succeeds, and exhausting all addresses errors out"
|
||||||
113
tests/20_local-resume-loop.test
Executable file
113
tests/20_local-resume-loop.test
Executable file
@@ -0,0 +1,113 @@
|
|||||||
|
#!/bin/bash
# Issue #206: a continue/update crawl looped forever when the resume Range got a
# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
#
# Exit status: 0 on success, 1 on failure, 77 (skip) when python3 is missing.
set -u

: "${top_srcdir:=..}"
testdir=$(cd "$(dirname "$0")" && pwd)
server="${testdir}/local-server.py"

command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77

tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
serverpid=
crawlpid=
guardpid=

# Kill whatever is still running (crawl, pass-2 watchdog, server), reap the
# server, and remove the scratch directory.
cleanup() {
  test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
  test -n "$guardpid" && kill "$guardpid" 2>/dev/null
  if test -n "$serverpid"; then
    kill "$serverpid" 2>/dev/null
    wait "$serverpid" 2>/dev/null
  fi
  rm -rf "$tmpdir"
}
trap cleanup EXIT HUP INT QUIT PIPE TERM

# --- start the server, discover its ephemeral port --------------------------
# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
serverlog="${tmpdir}/server.log"
counter="${tmpdir}/blobcount"
RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
serverpid=$!
port=
# Poll the server log for its "PORT <n>" first line (50 tries, 0.1s apart).
for _ in $(seq 1 50); do
  line=$(head -n1 "$serverlog" 2>/dev/null)
  if test "${line%% *}" == "PORT"; then
    port="${line#PORT }"
    break
  fi
  kill -0 "$serverpid" 2>/dev/null || {
    echo "server exited early: $(cat "$serverlog")"
    exit 1
  }
  sleep 0.1
done
test -n "$port" || {
  echo "could not discover server port"
  exit 1
}
base="http://127.0.0.1:${port}"

# Probe with the "command -v" builtin, like the python3 check above: it does
# not depend on an external which(1) being installed.
command -v httrack >/dev/null || {
  echo "could not find httrack"
  exit 1
}
out="${tmpdir}/crawl"
mkdir "$out"
common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
refdir="${out}/hts-cache/ref"

# --- pass 1: crawl, interrupt once the blob download is underway -------------
printf '[pass 1: interrupt mid-download] ..\t'
httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
crawlpid=$!
# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
# finalizes the cache and serializes the temp-ref.
for _ in $(seq 1 300); do
  test -s "$counter" && break
  kill -0 "$crawlpid" 2>/dev/null || break
  sleep 0.1
done
sleep 0.5
kill -TERM "$crawlpid" 2>/dev/null
wait "$crawlpid" 2>/dev/null
crawlpid=
test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
  echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
  exit 1
}
echo "OK (temp-ref present)"
# Requests seen so far; pass 2 is judged on the delta.
before=$(wc -c <"$counter" 2>/dev/null || echo 0)

# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
printf '[pass 2: resume must terminate] ..\t'
HANG_RC=137 # 128 + SIGKILL
httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
crawlpid=$!
# Watchdog subshell. The sleep is its child, so cancelling the watchdog must
# also kill the sleep; otherwise an orphaned "sleep 30" outlives the test and
# keeps its inherited output descriptors open.
(
  sleeper=
  trap 'test -n "$sleeper" && kill "$sleeper" 2>/dev/null; exit 0' TERM
  sleep 30 &
  sleeper=$!
  # wait returns non-zero when interrupted by TERM, which skips the kill.
  wait "$sleeper" && kill -9 "$crawlpid" 2>/dev/null
) &
guardpid=$!
rc=0
wait "$crawlpid" 2>/dev/null || rc=$?
crawlpid=
kill "$guardpid" 2>/dev/null || true
wait "$guardpid" 2>/dev/null || true
guardpid=
if test "$rc" -eq "$HANG_RC"; then
  echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
  exit 1
fi
echo "OK (terminated, rc=$rc)"

# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
after=$(wc -c <"$counter" 2>/dev/null || echo 0)
hits=$((after - before))
printf '[bounded re-get count] ..\t'
if test "$hits" -lt 2; then
  echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
  exit 1
fi
if test "$hits" -gt 8; then
  echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
  exit 1
fi
echo "OK (${hits} requests)"
|
||||||
11
tests/21_local-intl-update.test
Normal file
11
tests/21_local-intl-update.test
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
#
# #157: a dotless, accented URL that was named .html on the first crawl must
# still be named .html after an update, and must not revert to the
# extensionless name.

top_srcdir=${top_srcdir:-..}

# Expected state of the mirror after the update pass.
expect=(
  --found 'intl/Instalação_CVS_no_Ubuntu.html'
  --not-found 'intl/Instalação_CVS_no_Ubuntu'
)

bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun "${expect[@]}" \
  httrack 'BASEURL/intl/index.html'
|
||||||
@@ -2,6 +2,9 @@
|
|||||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||||
# silently drop it from the dist tarball and break "make distcheck".
|
# silently drop it from the dist tarball and break "make distcheck".
|
||||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||||
|
proxy-https-server.py \
|
||||||
|
local-crawl.sh local-server.py server.crt server.key \
|
||||||
|
server-root/simple/basic.html server-root/simple/link.html \
|
||||||
fixtures/cache-golden/hts-cache/new.zip
|
fixtures/cache-golden/hts-cache/new.zip
|
||||||
|
|
||||||
TESTS_ENVIRONMENT =
|
TESTS_ENVIRONMENT =
|
||||||
@@ -10,6 +13,7 @@ TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
|
|||||||
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
### TESTS_ENVIRONMENT += $(SHLIBPATH_VAR)="$(top_builddir)/src/$(LT_CV_OBJDIR)$${$(SHLIBPATH_VAR):+$(PATH_SEPARATOR)}$$$(SHLIBPATH_VAR)"
|
||||||
TESTS_ENVIRONMENT += ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS)
|
TESTS_ENVIRONMENT += ONLINE_UNIT_TESTS=$(ONLINE_UNIT_TESTS)
|
||||||
TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
TESTS_ENVIRONMENT += HTTPS_SUPPORT=$(HTTPS_SUPPORT)
|
||||||
|
TESTS_ENVIRONMENT += V6_SUPPORT=$(V6_SUPPORT)
|
||||||
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
TESTS_ENVIRONMENT += top_srcdir=$(top_srcdir)
|
||||||
|
|
||||||
TEST_EXTENSIONS = .test
|
TEST_EXTENSIONS = .test
|
||||||
@@ -24,6 +28,9 @@ TESTS = \
|
|||||||
01_engine-cache-golden.test \
|
01_engine-cache-golden.test \
|
||||||
01_engine-charset.test \
|
01_engine-charset.test \
|
||||||
01_engine-cmdline.test \
|
01_engine-cmdline.test \
|
||||||
|
01_engine-cookies.test \
|
||||||
|
01_engine-copyopt.test \
|
||||||
|
01_engine-dns.test \
|
||||||
01_engine-doitlog.test \
|
01_engine-doitlog.test \
|
||||||
01_engine-entities.test \
|
01_engine-entities.test \
|
||||||
01_engine-filter.test \
|
01_engine-filter.test \
|
||||||
@@ -32,6 +39,8 @@ TESTS = \
|
|||||||
01_engine-mime.test \
|
01_engine-mime.test \
|
||||||
01_engine-parse.test \
|
01_engine-parse.test \
|
||||||
01_engine-rcfile.test \
|
01_engine-rcfile.test \
|
||||||
|
01_engine-relative.test \
|
||||||
|
01_engine-savename.test \
|
||||||
01_engine-simplify.test \
|
01_engine-simplify.test \
|
||||||
01_engine-strsafe.test \
|
01_engine-strsafe.test \
|
||||||
02_manpage-regen.test \
|
02_manpage-regen.test \
|
||||||
@@ -42,6 +51,16 @@ TESTS = \
|
|||||||
11_crawl-international.test \
|
11_crawl-international.test \
|
||||||
11_crawl-longurl.test \
|
11_crawl-longurl.test \
|
||||||
11_crawl-parsing.test \
|
11_crawl-parsing.test \
|
||||||
12_crawl_https.test
|
12_crawl_https.test \
|
||||||
|
13_crawl_proxy_https.test \
|
||||||
|
13_local-cookies.test \
|
||||||
|
14_local-https.test \
|
||||||
|
15_local-types.test \
|
||||||
|
16_local-assume.test \
|
||||||
|
17_local-empty-ct.test \
|
||||||
|
18_local-update.test \
|
||||||
|
19_local-connect-fallback.test \
|
||||||
|
20_local-resume-loop.test \
|
||||||
|
21_local-intl-update.test
|
||||||
|
|
||||||
CLEANFILES = check-network_sh.cache
|
CLEANFILES = check-network_sh.cache
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ function debug {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function info {
|
function info {
|
||||||
printf "[$*] ..\t" >&2
|
printf '[%s] ..\t' "$*" >&2
|
||||||
}
|
}
|
||||||
|
|
||||||
function result {
|
function result {
|
||||||
@@ -66,31 +66,30 @@ function start-crawl {
|
|||||||
--debug)
|
--debug)
|
||||||
verbose=1
|
verbose=1
|
||||||
;;
|
;;
|
||||||
--no-purge|--summary|--print-files)
|
--no-purge | --summary | --print-files) ;;
|
||||||
;;
|
|
||||||
--errors | --files | --found | --not-found | --directory)
|
--errors | --files | --found | --not-found | --directory)
|
||||||
pos=$[${pos}+1]
|
pos=$((pos + 1))
|
||||||
test "$#" -ge "$pos" || warning "missing argument" || return 1
|
test "$#" -ge "$pos" || warning "missing argument" || return 1
|
||||||
;;
|
;;
|
||||||
httrack)
|
httrack)
|
||||||
pos=$[${pos}+1]
|
pos=$((pos + 1))
|
||||||
break;
|
break
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
warning "unrecognized option ${!pos}"
|
warning "unrecognized option ${!pos}"
|
||||||
return 1
|
return 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
pos=$[${pos}+1]
|
pos=$((pos + 1))
|
||||||
done
|
done
|
||||||
debug "remaining args: ${@:${pos}}"
|
debug "remaining args: ${*:pos}"
|
||||||
|
|
||||||
# ut/ won't exceed 2 minutes
|
# ut/ won't exceed 2 minutes
|
||||||
moreargs="--quiet --max-time=120 --timeout=30 --connection-per-second=5"
|
moreargs=(--quiet --max-time=120 --timeout=30 --connection-per-second=5)
|
||||||
|
|
||||||
# proxy environment ?
|
# proxy environment ?
|
||||||
if test -n "$http_proxy"; then
|
if test -n "${http_proxy:-}"; then
|
||||||
moreargs="$moreargs --proxy $http_proxy"
|
moreargs+=(--proxy "$http_proxy")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
test -n "$tmpdir" || ! warning "no tmpdir" || return 1
|
test -n "$tmpdir" || ! warning "no tmpdir" || return 1
|
||||||
@@ -104,9 +103,9 @@ function start-crawl {
|
|||||||
|
|
||||||
# start crawl
|
# start crawl
|
||||||
log="${tmp}/log"
|
log="${tmp}/log"
|
||||||
debug starting httrack -O "${tmp}" ${moreargs} ${@:${pos}}
|
debug starting httrack -O "${tmp}" "${moreargs[@]}" "${@:pos}"
|
||||||
info "running httrack ${@:${pos}}"
|
info "running httrack ${*:pos}"
|
||||||
httrack -O "${tmp}" --user-agent="httrack $ver ut ($(uname -omrs))" ${moreargs} ${@:${pos}} >"${log}" 2>&1 &
|
httrack -O "${tmp}" --user-agent="httrack $ver ut ($(uname -omrs))" "${moreargs[@]}" "${@:pos}" >"${log}" 2>&1 &
|
||||||
crawlpid="$!"
|
crawlpid="$!"
|
||||||
debug "started cralwer on pid $crawlpid"
|
debug "started cralwer on pid $crawlpid"
|
||||||
wait "$crawlpid"
|
wait "$crawlpid"
|
||||||
@@ -164,12 +163,12 @@ function start-crawl {
|
|||||||
;;
|
;;
|
||||||
--files)
|
--files)
|
||||||
shift
|
shift
|
||||||
nFiles=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${tmp}/hts-log.txt" \
|
nFiles=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${tmp}/hts-log.txt" |
|
||||||
| sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
|
sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
|
||||||
assert_equals "checking files" "$1" "$nFiles"
|
assert_equals "checking files" "$1" "$nFiles"
|
||||||
;;
|
;;
|
||||||
httrack)
|
httrack)
|
||||||
break;
|
break
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
shift
|
shift
|
||||||
@@ -195,7 +194,7 @@ tmpdir=
|
|||||||
crawlpid=
|
crawlpid=
|
||||||
nopurge=
|
nopurge=
|
||||||
verbose=
|
verbose=
|
||||||
trap "cleanup" 0 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap cleanup EXIT HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
|
|
||||||
# working directory
|
# working directory
|
||||||
tmpdir="${tmptopdir}/httrack_ut.$$"
|
tmpdir="${tmptopdir}/httrack_ut.$$"
|
||||||
|
|||||||
262
tests/local-crawl.sh
Executable file
262
tests/local-crawl.sh
Executable file
@@ -0,0 +1,262 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Launcher for httrack crawl tests against the local Python test server.
|
||||||
|
#
|
||||||
|
# Starts tests/local-server.py on an ephemeral port, discovers the port from
|
||||||
|
# the server's stdout, then runs httrack against http(s)://127.0.0.1:$PORT and
|
||||||
|
# audits the mirror. The server is always killed and the tmpdir removed on exit.
|
||||||
|
#
|
||||||
|
# The token BASEURL in any httrack argument is replaced with the discovered
|
||||||
|
# http(s)://127.0.0.1:$PORT base. --found/--directory paths are relative to the
|
||||||
|
# discovered host root (127.0.0.1_<port>/), since the random port leaks into
|
||||||
|
# the mirror directory name.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||||
|
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||||
|
# httrack BASEURL/some/path [httrack-args...]
|
||||||
|
|
||||||
|
set -u

# Fixture locations, resolved relative to the directory holding this script.
# The docroot can be overridden through LOCAL_SERVER_ROOT (or --root, later).
testdir=$(cd "$(dirname "$0")" && pwd)
server=${testdir}/local-server.py
root=${LOCAL_SERVER_ROOT:-${testdir}/server-root}
cert=${testdir}/server.crt
key=${testdir}/server.key

# Launcher state, all empty until set by a flag or by a started process.
tls=''
verbose=''
rerun=''
tmpdir=''
serverpid=''
crawlpid=''
|
||||||
|
|
||||||
|
# Print the arguments on stderr, prefixed with "** ". Always returns 0.
warning() {
  printf '** %s\n' "$*" >&2
  return 0
}
|
||||||
|
# Report a fatal problem through warning(), then exit the script with status 1.
die() {
  warning "$*"
  exit 1
}
|
||||||
|
# Echo the arguments on stderr, but only when $verbose is non-empty.
# Always returns 0.
debug() {
  if [[ -n "$verbose" ]]; then
    echo "$*" >&2
  fi
  return 0
}
|
||||||
|
# Announce a step on stderr as "[<text>] ..<TAB>", without a trailing newline.
info() {
  printf "[%s] ..\t" "$*" >&2
}
|
||||||
|
# Print a step's outcome on stderr, followed by a newline.
result() {
  echo "$*" >&2
}
|
||||||
|
|
||||||
|
# Trap handler: tear down everything this launcher started.
#  - SIGKILL a crawl that is still recorded in $crawlpid;
#  - terminate and reap the server recorded in $serverpid;
#  - remove $tmpdir, unless $nopurge is set.
cleanup() {
  [[ -z "$crawlpid" ]] || {
    kill -9 "$crawlpid" 2>/dev/null
    crawlpid=
  }
  [[ -z "$serverpid" ]] || {
    kill "$serverpid" 2>/dev/null
    # Reap it so the port is released before we rm the tmpdir/log.
    wait "$serverpid" 2>/dev/null
    serverpid=
  }
  if [[ -n "$tmpdir" && -d "$tmpdir" ]]; then
    if [[ -z "$nopurge" ]]; then
      rm -rf "$tmpdir"
    fi
  fi
}
|
||||||
|
|
||||||
|
# assert_equals LABEL EXPECTED ACTUAL
# Announce LABEL, then compare the two strings. On a mismatch, report both
# values and exit the script with status 1; otherwise report "OK (EXPECTED)".
assert_equals() {
  local label=$1 expected=$2 actual=$3
  info "$label"
  if [[ "$expected" != "$actual" ]]; then
    result "expected '$expected', got '$actual'"
    exit 1
  fi
  result "OK ($expected)"
}
|
||||||
|
|
||||||
|
nopurge=
trap cleanup EXIT HUP INT QUIT PIPE TERM

# python3 is required; mirror check-network.sh's skip-with-77 convention.
if ! command -v python3 >/dev/null; then
  echo "python3 not found; skipping local crawl tests" && exit 77
fi

# Scratch directory: a fresh httrack_local.XXXXXX under $TMPDIR (default /tmp).
tmptopdir=${TMPDIR:-/tmp}
if [[ ! -d "$tmptopdir" ]] && ! mkdir -p "$tmptopdir"; then
  die "no temporary directory; set TMPDIR"
fi
if ! tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX"); then
  die "could not create tmpdir"
fi
|
||||||
|
|
||||||
|
# --- parse leading control flags --------------------------------------------
# Everything up to the literal "httrack" token configures this launcher; what
# follows it is handed to httrack. Audit options are queued together with
# their value in audit[] and evaluated after the crawl.
declare -a audit=()
scheme=http
pos=0
args=("$@")
nargs=$#
while test "$pos" -lt "$nargs"; do
  opt="${args[$pos]}"
  case "$opt" in
  --debug) verbose=1 ;;
  --rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
  --no-purge)
    nopurge=1
    audit+=("--no-purge")
    ;;
  --tls)
    tls=1
    scheme=https
    ;;
  --root | --errors | --files | --found | --not-found | --directory)
    # These all take a value: fail with a clear message when it is missing,
    # instead of tripping "set -u" on an out-of-range array index.
    pos=$((pos + 1))
    test "$pos" -lt "$nargs" || die "missing argument for $opt"
    if test "$opt" == "--root"; then
      root="${args[$pos]}"
    else
      audit+=("$opt" "${args[$pos]}")
    fi
    ;;
  httrack)
    pos=$((pos + 1))
    break
    ;;
  *) die "unrecognized option $opt" ;;
  esac
  pos=$((pos + 1))
done
# pos now indexes the first httrack argument; without one there is nothing to
# crawl, so stop before starting the server.
test "$pos" -lt "$nargs" ||
  die "missing httrack arguments (expected: httrack BASEURL/path [args...])"
|
||||||
|
|
||||||
|
# --- start the server --------------------------------------------------------
[[ -r "$server" ]] || die "cannot read $server"
serverlog="${tmpdir}/server.log"
serverargs=(--root "$root")
# TLS mode additionally passes the test certificate and key.
[[ -z "$tls" ]] || serverargs+=(--tls --cert "$cert" --key "$key")
debug "starting python3 $server ${serverargs[*]}"
python3 "$server" "${serverargs[@]}" >"$serverlog" 2>&1 &
serverpid=$!

# Wait for the "PORT <n>" line (server prints it once bound): poll the first
# line of the log 50 times, 0.1s apart, and give up early if the server died.
port=
for _ in $(seq 1 50); do
  if [[ -s "$serverlog" ]]; then
    firstline=$(head -n1 "$serverlog")
    if [[ "${firstline%% *}" == "PORT" ]]; then
      port="${firstline#PORT }"
      break
    fi
  fi
  if ! kill -0 "$serverpid" 2>/dev/null; then
    die "server exited early: $(cat "$serverlog")"
  fi
  sleep 0.1
done
[[ -n "$port" ]] || die "could not discover server port: $(cat "$serverlog")"

baseurl="${scheme}://127.0.0.1:${port}"
debug "server listening on ${baseurl}"
|
||||||
|
|
||||||
|
# --- substitute BASEURL in the remaining (httrack) args ----------------------
# Copy args[pos..] into hts[], replacing every occurrence of the BASEURL token
# with the discovered base URL.
declare -a hts=()
for ((idx = pos; idx < nargs; idx++)); do
  hts+=("${args[idx]//BASEURL/$baseurl}")
done
|
||||||
|
|
||||||
|
# --- run httrack -------------------------------------------------------------
# Probe with the "command -v" builtin (as done for python3) rather than the
# external which(1), which is not guaranteed to be installed.
command -v httrack >/dev/null || die "could not find httrack"
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
test -n "$ver" || die "could not run httrack"

out="${tmpdir}/crawl"
mkdir "$out" || die "could not create $out"
# Localhost is fast; disable the rate/bandwidth safety limits but keep a
# max-time backstop so a hang cannot wedge the suite.
declare -a moreargs=(--quiet --max-time=120 --timeout=30 --disable-security-limits --robots=0)
log="${tmpdir}/log"
info "running httrack ${hts[*]}"
# Run in the background and record the pid in $crawlpid, which cleanup() kills
# if the script is torn down while the crawl is still running.
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" "${moreargs[@]}" "${hts[@]}" >"$log" 2>&1 &
crawlpid=$!
wait "$crawlpid"
crawlres=$?
crawlpid=
# httrack exits 0 even on hard connect/DNS errors, so this is a backstop only;
# the real guard is the audit below (--errors 0 plus the host-root existence check).
test "$crawlres" -eq 0 || ! result "httrack exited $crawlres" || {
  cat "$log" >&2
  exit 1
}
result "OK"
# Surface any "Error:" lines from httrack's own log on stderr.
grep -iE "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt" >&2
|
||||||
|
|
||||||
|
# --- optional second pass: re-mirror into the same dir (cache/update path) ----
if [[ -n "$rerun" ]]; then
  info "re-running httrack (update pass)"
  update_log="${log}.2"
  httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" \
    "${moreargs[@]}" "${hts[@]}" >"$update_log" 2>&1 &
  crawlpid=$!
  wait "$crawlpid"
  crawlres=$?
  crawlpid=
  if ((crawlres != 0)) && result "update pass exited $crawlres"; then
    cat "$update_log" >&2
    exit 1
  fi
  result "OK (update)"

  # The update summary reports "files updated"; a fresh crawl never does. Assert
  # it so a regression that bypasses the cache (re-crawls fresh) can't pass.
  info "checking update used the cache"
  if ! grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
    result "update pass did not report cache activity"
    exit 1
  fi
  result "OK"
fi
|
||||||
|
|
||||||
|
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
# The port-suffixed directory is preferred; the bare address is the fallback.
hostroot=
if [[ -d "${out}/127.0.0.1_${port}" ]]; then
  hostroot="${out}/127.0.0.1_${port}"
elif [[ -d "${out}/127.0.0.1" ]]; then
  hostroot="${out}/127.0.0.1"
else
  die "could not find host root under $out"
fi
debug "host root: $hostroot"
|
||||||
|
|
||||||
|
# --- audit -------------------------------------------------------------------
# Walk the queued audit options in order. Each recognized option consumes the
# entry that follows it as its value; the first failed check exits with 1.
# Entries without a branch here (e.g. --no-purge) are simply stepped over.
k=0
while ((k < ${#audit[@]})); do
  case "${audit[k]}" in
  --errors)
    # Expected number of "Error:" lines in httrack's log.
    k=$((k + 1))
    want="${audit[k]}"
    assert_equals "checking errors" "$want" \
      "$(grep -iEc "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt")"
    ;;
  --files)
    # Expected "files written" count from the "mirror complete" summary line.
    k=$((k + 1))
    want="${audit[k]}"
    summary=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${out}/hts-log.txt")
    nFiles=$(printf '%s\n' "$summary" |
      sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
    assert_equals "checking files" "$want" "$nFiles"
    ;;
  --found)
    # A regular file that must exist under the host root.
    k=$((k + 1))
    relpath="${audit[k]}"
    info "checking for ${relpath}"
    if [[ ! -f "${hostroot}/${relpath}" ]]; then
      result "not found"
      exit 1
    fi
    result "OK"
    ;;
  --not-found)
    # A regular file that must NOT exist under the host root.
    k=$((k + 1))
    relpath="${audit[k]}"
    info "checking absence of ${relpath}"
    if [[ -f "${hostroot}/${relpath}" ]]; then
      result "present"
      exit 1
    fi
    result "OK"
    ;;
  --directory)
    # A directory that must exist under the host root.
    k=$((k + 1))
    relpath="${audit[k]}"
    info "checking for dir ${relpath}"
    if [[ ! -d "${hostroot}/${relpath}" ]]; then
      result "not found"
      exit 1
    fi
    result "OK"
    ;;
  esac
  k=$((k + 1))
done
|
||||||
308
tests/local-server.py
Executable file
308
tests/local-server.py
Executable file
@@ -0,0 +1,308 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Self-contained local web server for httrack's crawl tests.
|
||||||
|
|
||||||
|
Serves static fixtures from a docroot plus a handful of dynamic endpoints
|
||||||
|
(cookies, ...) so httrack can be exercised over loopback, deterministically and
|
||||||
|
offline, instead of crawling the live ut.httrack.com.
|
||||||
|
|
||||||
|
Binds to an ephemeral port (port 0) and prints the chosen port to stdout as
|
||||||
|
"PORT <n>\n" so a launcher can discover it. Pass --tls to wrap the socket with
|
||||||
|
the shipped self-signed test cert; httrack does not verify certs, so no CA
|
||||||
|
trust plumbing is needed.
|
||||||
|
|
||||||
|
stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||||
|
from urllib.parse import quote, unquote, urlsplit
|
||||||
|
|
||||||
|
# Cookie chain replicated from the old ut/cookies/*.php fixtures.
|
||||||
|
COOKIE_PATH = "/cookies/"
|
||||||
|
COOKIES = {
|
||||||
|
"cat": "dog",
|
||||||
|
"cake": "is a lie!",
|
||||||
|
"badger": "mushroom, with 'ants'",
|
||||||
|
}
|
||||||
|
|
||||||
|
PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||||
|
\t"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
||||||
|
<head>
|
||||||
|
\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||||
|
\t<title>Sample test</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{body}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Handler(SimpleHTTPRequestHandler):
|
||||||
|
# Quieter logging; the launcher captures httrack's own log anyway.
|
||||||
|
def log_message(self, fmt, *args):
|
||||||
|
if os.environ.get("LOCAL_SERVER_VERBOSE"):
|
||||||
|
super().log_message(fmt, *args)
|
||||||
|
|
||||||
|
# --- helpers -----------------------------------------------------------
|
||||||
|
|
||||||
|
def request_cookies(self):
|
||||||
|
"""Parse the Cookie header into {name: decoded-value}.
|
||||||
|
|
||||||
|
Mirrors PHP's $_COOKIE: values are url-decoded, matching the encoding
|
||||||
|
applied when the cookie was set (see set_cookie)."""
|
||||||
|
jar = {}
|
||||||
|
raw = self.headers.get("Cookie", "")
|
||||||
|
for pair in raw.split(";"):
|
||||||
|
pair = pair.strip()
|
||||||
|
if "=" in pair:
|
||||||
|
name, value = pair.split("=", 1)
|
||||||
|
jar[name.strip()] = unquote(value.strip())
|
||||||
|
return jar
|
||||||
|
|
||||||
|
def set_cookie(self, name, value):
|
||||||
|
"""Queue a Set-Cookie header, url-encoding the value like PHP's
|
||||||
|
setcookie() so spaces/quotes/commas stay a single token that httrack
|
||||||
|
can store and replay verbatim."""
|
||||||
|
self._set_cookies.append(f"{name}={quote(value)}; Path={COOKIE_PATH}")
|
||||||
|
|
||||||
|
def send_html(self, body, status=200, extra_status=None):
|
||||||
|
encoded = PAGE.format(body=body).encode("utf-8")
|
||||||
|
self.send_response(status, extra_status)
|
||||||
|
self.send_header("Content-Type", "text/html; charset=utf-8")
|
||||||
|
self.send_header("Content-Length", str(len(encoded)))
|
||||||
|
for cookie in self._set_cookies:
|
||||||
|
self.send_header("Set-Cookie", cookie)
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(encoded)
|
||||||
|
|
||||||
|
def fail_cookie(self, what):
|
||||||
|
# The old PHPs answered 500 with the reason in the status line.
|
||||||
|
self.send_html("", status=500, extra_status=f"The {what} is missing or invalid")
|
||||||
|
|
||||||
|
# --- dynamic routes ----------------------------------------------------
|
||||||
|
|
||||||
|
def route_entrance(self):
|
||||||
|
self.set_cookie("cat", COOKIES["cat"])
|
||||||
|
self.set_cookie("cake", COOKIES["cake"])
|
||||||
|
self.send_html('\tThis is a <a href="second.php">link</a>')
|
||||||
|
|
||||||
|
def route_second(self):
|
||||||
|
jar = self.request_cookies()
|
||||||
|
if jar.get("cat") != COOKIES["cat"]:
|
||||||
|
return self.fail_cookie("cat")
|
||||||
|
if jar.get("cake") != COOKIES["cake"]:
|
||||||
|
return self.fail_cookie("cake")
|
||||||
|
self.set_cookie("badger", COOKIES["badger"])
|
||||||
|
self.send_html('\tThis is a <a href="third.php">link</a>')
|
||||||
|
|
||||||
|
def route_third(self):
|
||||||
|
jar = self.request_cookies()
|
||||||
|
if jar.get("cat") != COOKIES["cat"]:
|
||||||
|
return self.fail_cookie("cat")
|
||||||
|
if jar.get("cake") != COOKIES["cake"]:
|
||||||
|
return self.fail_cookie("cake")
|
||||||
|
if jar.get("badger") != COOKIES["badger"]:
|
||||||
|
return self.fail_cookie("badger")
|
||||||
|
self.send_html("\tThis is a test.")
|
||||||
|
|
||||||
|
def route_robots(self):
|
||||||
|
body = b"User-agent: *\nDisallow:\n"
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "text/plain")
|
||||||
|
self.send_header("Content-Length", str(len(body)))
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(body)
|
||||||
|
|
||||||
|
# --- type/extension matrix (issue #267 family) -------------------------
|
||||||
|
|
||||||
|
def send_raw(self, body, content_type):
|
||||||
|
"""Send a raw body with an explicit Content-Type, or none at all when
|
||||||
|
content_type is None (to observe httrack's typeless-file naming)."""
|
||||||
|
self.send_response(200)
|
||||||
|
if content_type is not None:
|
||||||
|
self.send_header("Content-Type", content_type)
|
||||||
|
self.send_header("Content-Length", str(len(body)))
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(body)
|
||||||
|
|
||||||
|
# Fake-binary blobs for the image/pdf/typeless cases.
|
||||||
|
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
|
||||||
|
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
|
||||||
|
|
||||||
|
# path -> (body, content_type); None sends no header, "" sends an empty
|
||||||
|
# Content-Type value (no usable type, must be treated like None).
|
||||||
|
TYPE_MATRIX = {
|
||||||
|
"/types/control.php": (b"<html><body>control</body></html>", "text/html"),
|
||||||
|
"/types/photo.png": (FAKE_PNG, "image/png"),
|
||||||
|
"/types/doc.pdf": (FAKE_PDF, "application/pdf"),
|
||||||
|
"/types/notype.png": (FAKE_PNG, None),
|
||||||
|
"/types/notype.pdf": (FAKE_PDF, None),
|
||||||
|
"/types/emptyct.png": (FAKE_PNG, ""),
|
||||||
|
"/types/lie.png": (FAKE_PNG, "text/html"),
|
||||||
|
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
|
||||||
|
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
|
||||||
|
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
|
||||||
|
"/types/style.css": (b"body { color: red; }\n", "text/css"),
|
||||||
|
"/types/data.json": (b'{"k": "v"}\n', "application/json"),
|
||||||
|
"/types/gen.php": (FAKE_PNG, "image/png"),
|
||||||
|
}
|
||||||
|
|
||||||
|
def route_types_index(self):
|
||||||
|
body = (
|
||||||
|
'\t<a href="control.php">control</a>\n'
|
||||||
|
'\t<img src="photo.png" />\n'
|
||||||
|
'\t<a href="doc.pdf">doc</a>\n'
|
||||||
|
'\t<img src="notype.png" />\n'
|
||||||
|
'\t<a href="notype.pdf">notypepdf</a>\n'
|
||||||
|
'\t<img src="emptyct.png" />\n'
|
||||||
|
'\t<img src="lie.png" />\n'
|
||||||
|
'\t<a href="report.pdf">report</a>\n'
|
||||||
|
'\t<a href="page.htm">htm</a>\n'
|
||||||
|
'\t<script src="script.js"></script>\n'
|
||||||
|
'\t<link rel="stylesheet" href="style.css" />\n'
|
||||||
|
'\t<a href="data.json">json</a>\n'
|
||||||
|
'\t<img src="gen.php?id=5" />\n'
|
||||||
|
)
|
||||||
|
self.send_html(body)
|
||||||
|
|
||||||
|
def route_types(self):
|
||||||
|
path = urlsplit(self.path).path
|
||||||
|
body, ctype = self.TYPE_MATRIX[path]
|
||||||
|
self.send_raw(body, ctype)
|
||||||
|
|
||||||
|
# --- special chars in URLs across an update (issue #157) ---------------
|
||||||
|
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||||
|
# name the first crawl picks (.html) must survive the update pass.
|
||||||
|
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||||
|
|
||||||
|
def route_intl_index(self):
|
||||||
|
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||||
|
|
||||||
|
def route_intl_page(self):
|
||||||
|
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||||
|
|
||||||
|
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||||
|
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||||
|
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||||
|
RESUME_LEN = len(RESUME_PREFIX) + 4096 # declared length never delivered
|
||||||
|
_resume_started = False
|
||||||
|
|
||||||
|
def route_resume_index(self):
|
||||||
|
self.send_html('\t<a href="blob.txt">blob</a>')
|
||||||
|
|
||||||
|
def route_resume(self):
|
||||||
|
counter = os.environ.get("RESUME_COUNTER")
|
||||||
|
if counter:
|
||||||
|
with open(counter, "a") as fp:
|
||||||
|
fp.write("x")
|
||||||
|
# First GET: stall mid-body so the crawl can be interrupted with a partial.
|
||||||
|
if not Handler._resume_started:
|
||||||
|
Handler._resume_started = True
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "image/png")
|
||||||
|
self.send_header("Content-Length", str(self.RESUME_LEN))
|
||||||
|
self.send_header("Accept-Ranges", "bytes")
|
||||||
|
self.end_headers()
|
||||||
|
if self.command != "HEAD":
|
||||||
|
self.wfile.write(self.RESUME_PREFIX)
|
||||||
|
self.wfile.flush()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
time.sleep(3600)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
self.send_response(416, "Requested Range Not Satisfiable")
|
||||||
|
self.send_header("Content-Type", "image/png")
|
||||||
|
self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
|
||||||
|
self.send_header("Content-Length", "0")
|
||||||
|
self.end_headers()
|
||||||
|
|
||||||
|
ROUTES = {
|
||||||
|
"/cookies/entrance.php": route_entrance,
|
||||||
|
"/cookies/second.php": route_second,
|
||||||
|
"/cookies/third.php": route_third,
|
||||||
|
"/robots.txt": route_robots,
|
||||||
|
"/types/index.html": route_types_index,
|
||||||
|
"/types/control.php": route_types,
|
||||||
|
"/types/photo.png": route_types,
|
||||||
|
"/types/doc.pdf": route_types,
|
||||||
|
"/types/notype.png": route_types,
|
||||||
|
"/types/notype.pdf": route_types,
|
||||||
|
"/types/emptyct.png": route_types,
|
||||||
|
"/types/lie.png": route_types,
|
||||||
|
"/types/report.pdf": route_types,
|
||||||
|
"/types/page.htm": route_types,
|
||||||
|
"/types/script.js": route_types,
|
||||||
|
"/types/style.css": route_types,
|
||||||
|
"/types/data.json": route_types,
|
||||||
|
"/types/gen.php": route_types,
|
||||||
|
"/intl/index.html": route_intl_index,
|
||||||
|
"/intl/" + INTL_NAME: route_intl_page,
|
||||||
|
"/resume/index.html": route_resume_index,
|
||||||
|
"/resume/blob.txt": route_resume,
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- dispatch ----------------------------------------------------------
|
||||||
|
|
||||||
|
def dispatch(self):
|
||||||
|
self._set_cookies = []
|
||||||
|
path = urlsplit(self.path).path
|
||||||
|
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||||
|
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||||
|
if handler is not None:
|
||||||
|
handler(self)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def do_GET(self):
|
||||||
|
if not self.dispatch():
|
||||||
|
super().do_GET()
|
||||||
|
|
||||||
|
def do_HEAD(self):
|
||||||
|
if not self.dispatch():
|
||||||
|
super().do_HEAD()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
|
parser.add_argument("--root", required=True, help="docroot for static files")
|
||||||
|
parser.add_argument("--bind", default="127.0.0.1", help="bind address")
|
||||||
|
parser.add_argument("--tls", action="store_true", help="serve HTTPS")
|
||||||
|
parser.add_argument("--cert", help="TLS certificate (PEM)")
|
||||||
|
parser.add_argument("--key", help="TLS private key (PEM)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
root = os.path.abspath(args.root)
|
||||||
|
|
||||||
|
def factory(*a, **kw):
|
||||||
|
return Handler(*a, directory=root, **kw)
|
||||||
|
|
||||||
|
httpd = ThreadingHTTPServer((args.bind, 0), factory)
|
||||||
|
|
||||||
|
if args.tls:
|
||||||
|
import ssl
|
||||||
|
|
||||||
|
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
|
||||||
|
ctx.load_cert_chain(certfile=args.cert, keyfile=args.key)
|
||||||
|
httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)
|
||||||
|
|
||||||
|
port = httpd.socket.getsockname()[1]
|
||||||
|
# The launcher reads this line to discover the ephemeral port.
|
||||||
|
print(f"PORT {port}", flush=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
httpd.serve_forever()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
151
tests/proxy-https-server.py
Normal file
151
tests/proxy-https-server.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
|
||||||
|
|
||||||
|
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
|
||||||
|
ports. Every request line the proxy receives (and any Proxy-Authorization) is
|
||||||
|
appended to the proxy log; every header the origin receives over the tunnel is
|
||||||
|
appended to the origin log. That lets the test assert both that an https crawl
|
||||||
|
tunneled through the proxy and that proxy credentials never leaked to the origin.
|
||||||
|
|
||||||
|
Proxy modes (argv[3], default "ok"):
|
||||||
|
ok - honour CONNECT and tunnel to the origin
|
||||||
|
flood - answer 200 then stream headers forever with no blank line, to exercise
|
||||||
|
the client's bound on the proxy response (must not hang the crawl)
|
||||||
|
|
||||||
|
Usage: proxy-https-server.py <cert.pem> <logdir> [mode]
|
||||||
|
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
|
||||||
|
"""
|
||||||
|
import http.server
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import socketserver
|
||||||
|
import ssl
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
|
||||||
|
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
|
||||||
|
PROXY_LOG = "proxy.log"
|
||||||
|
ORIGIN_LOG = "origin-headers.log"
|
||||||
|
|
||||||
|
|
||||||
|
def make_origin(logdir):
|
||||||
|
class Origin(http.server.BaseHTTPRequestHandler):
|
||||||
|
def do_GET(self):
|
||||||
|
with open(os.path.join(logdir, ORIGIN_LOG), "a") as handle:
|
||||||
|
for key in self.headers.keys():
|
||||||
|
handle.write(key + "\n")
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "text/html")
|
||||||
|
self.send_header("Content-Length", str(len(ORIGIN_BODY)))
|
||||||
|
self.end_headers()
|
||||||
|
self.wfile.write(ORIGIN_BODY)
|
||||||
|
|
||||||
|
def log_message(self, *args):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return Origin
|
||||||
|
|
||||||
|
|
||||||
|
def start_origin(certfile, logdir):
|
||||||
|
httpd = socketserver.TCPServer(("127.0.0.1", 0), make_origin(logdir))
|
||||||
|
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
|
||||||
|
ctx.load_cert_chain(certfile)
|
||||||
|
httpd.socket = ctx.wrap_socket(httpd.socket, server_side=True)
|
||||||
|
port = httpd.socket.getsockname()[1]
|
||||||
|
threading.Thread(target=httpd.serve_forever, daemon=True).start()
|
||||||
|
return port
|
||||||
|
|
||||||
|
|
||||||
|
def pipe(src, dst):
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
data = src.recv(65536)
|
||||||
|
if not data:
|
||||||
|
break
|
||||||
|
dst.sendall(data)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
for sock in (src, dst):
|
||||||
|
try:
|
||||||
|
sock.shutdown(socket.SHUT_RDWR)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def handle_client(conn, logdir, mode):
|
||||||
|
rfile = conn.makefile("rb")
|
||||||
|
request_line = rfile.readline().decode("latin-1").strip()
|
||||||
|
auth = None
|
||||||
|
while True:
|
||||||
|
line = rfile.readline().decode("latin-1")
|
||||||
|
if line in ("\r\n", "\n", ""):
|
||||||
|
break
|
||||||
|
key, _, value = line.partition(":")
|
||||||
|
if key.strip().lower() == "proxy-authorization":
|
||||||
|
auth = value.strip()
|
||||||
|
with open(os.path.join(logdir, PROXY_LOG), "a") as handle:
|
||||||
|
handle.write(request_line + "\n")
|
||||||
|
if auth is not None:
|
||||||
|
handle.write("AUTH " + auth + "\n")
|
||||||
|
parts = request_line.split()
|
||||||
|
if not (len(parts) >= 2 and parts[0] == "CONNECT"):
|
||||||
|
conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
if mode == "flood":
|
||||||
|
# 200, then an endless header stream with no terminating blank line: the
|
||||||
|
# client must bound this and give up, not hang.
|
||||||
|
try:
|
||||||
|
conn.sendall(b"HTTP/1.0 200 Connection established\r\n")
|
||||||
|
while True:
|
||||||
|
conn.sendall(b"X-Pad: 0123456789\r\n")
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
host, _, port = parts[1].partition(":")
|
||||||
|
try:
|
||||||
|
upstream = socket.create_connection((host, int(port or 443)))
|
||||||
|
except OSError:
|
||||||
|
conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
|
||||||
|
threading.Thread(target=pipe, args=(conn, upstream), daemon=True).start()
|
||||||
|
pipe(upstream, conn)
|
||||||
|
|
||||||
|
|
||||||
|
def start_proxy(logdir, mode):
|
||||||
|
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||||
|
srv.bind(("127.0.0.1", 0))
|
||||||
|
srv.listen(16)
|
||||||
|
port = srv.getsockname()[1]
|
||||||
|
|
||||||
|
def serve():
|
||||||
|
while True:
|
||||||
|
conn, _ = srv.accept()
|
||||||
|
threading.Thread(
|
||||||
|
target=handle_client, args=(conn, logdir, mode), daemon=True
|
||||||
|
).start()
|
||||||
|
|
||||||
|
threading.Thread(target=serve, daemon=True).start()
|
||||||
|
return port
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
certfile, logdir = sys.argv[1], sys.argv[2]
|
||||||
|
mode = sys.argv[3] if len(sys.argv) > 3 else "ok"
|
||||||
|
for name in (PROXY_LOG, ORIGIN_LOG):
|
||||||
|
open(os.path.join(logdir, name), "w").close()
|
||||||
|
origin_port = start_origin(certfile, logdir)
|
||||||
|
proxy_port = start_proxy(logdir, mode)
|
||||||
|
print("ORIGIN %d" % origin_port, flush=True)
|
||||||
|
print("PROXY %d" % proxy_port, flush=True)
|
||||||
|
print("ready", flush=True)
|
||||||
|
threading.Event().wait()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
error=0
|
error=0
|
||||||
for i in *.test; do
|
for i in *.test; do
|
||||||
if bash $i ; then
|
if bash "$i"; then
|
||||||
echo "$i: passed" >&2
|
echo "$i: passed" >&2
|
||||||
else
|
else
|
||||||
echo "$i: ERROR" >&2
|
echo "$i: ERROR" >&2
|
||||||
error=$[${error}+1]
|
error=$((error + 1))
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
18
tests/server-root/simple/basic.html
Normal file
18
tests/server-root/simple/basic.html
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr">
|
||||||
|
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||||
|
<title>Sample test</title>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
This is a <a href="link.html?v=1">link</a>
|
||||||
|
This is a <a href='link.html?v=2'>link</a>
|
||||||
|
This is a <a href="./link.html?v=3">link</a>
|
||||||
|
This is a <a href=link.html?v=4>link</a>
|
||||||
|
|
||||||
|
</body>
|
||||||
3
tests/server-root/simple/link.html
Normal file
3
tests/server-root/simple/link.html
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
This is a link.
|
||||||
|
|
||||||
|
Go back to <a href="basic.html">home</a>.
|
||||||
21
tests/server.crt
Normal file
21
tests/server.crt
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIDbzCCAlegAwIBAgIUdWkDDomnY3WW95UqJ+UOASuR/i0wDQYJKoZIhvcNAQEL
|
||||||
|
BQAwODESMBAGA1UEAwwJMTI3LjAuMC4xMSIwIAYDVQQKDBlIVFRyYWNrIGxvY2Fs
|
||||||
|
IHRlc3Qgc2VydmVyMCAXDTI2MDYxNTE0NDQxMFoYDzIwNTYwNjA3MTQ0NDEwWjA4
|
||||||
|
MRIwEAYDVQQDDAkxMjcuMC4wLjExIjAgBgNVBAoMGUhUVHJhY2sgbG9jYWwgdGVz
|
||||||
|
dCBzZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDx78mogNhT
|
||||||
|
noWwRa51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf
|
||||||
|
3rwj79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJC
|
||||||
|
SGxIin9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4
|
||||||
|
ZbPgwep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaE
|
||||||
|
nQrAlTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJx
|
||||||
|
rjVEPfA8QSbtAgMBAAGjbzBtMB0GA1UdDgQWBBTHE0KKW8REV4HxajzVsIBxz3iL
|
||||||
|
9zAfBgNVHSMEGDAWgBTHE0KKW8REV4HxajzVsIBxz3iL9zAPBgNVHRMBAf8EBTAD
|
||||||
|
AQH/MBoGA1UdEQQTMBGHBH8AAAGCCWxvY2FsaG9zdDANBgkqhkiG9w0BAQsFAAOC
|
||||||
|
AQEAYlTEftrwGJBXuPmtxhmtw2HO/VTC4TGnq67hH5H+ptwgZJuuxCQ5KW6flTyp
|
||||||
|
FTyMhha33WD4EBL3wqqJsWr9Y4BXqi4G0lRqXBcC1oIUa2VYIDMER7kaY1qTSqE8
|
||||||
|
ARpwdB2BhvngAzDLc+4Jt4jQMRGr8fHAwxpDBoIZ1knbyzYNP73Bajse6/8YtxUu
|
||||||
|
nB2BsldjZnLvyHvRxUpWp92OyQih4jYSrlN6olDFlKDg7++kMhkHtJQW9a1t54VN
|
||||||
|
0ZXrB1ZRuHUUvGBq26x71riTWor7HNOSQaGeCMQjZNQkh5tfshNygUGSZVXTEwhG
|
||||||
|
xSrOL7NqBt2+EkVwf7LjGzjmBw==
|
||||||
|
-----END CERTIFICATE-----
|
||||||
28
tests/server.key
Normal file
28
tests/server.key
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
-----BEGIN PRIVATE KEY-----
|
||||||
|
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDx78mogNhTnoWw
|
||||||
|
Ra51NeGtapQ1PfTYLlIMUzuloFXOsR1/ozRkFucqHNftF22wf0gg4VQJSBSf3rwj
|
||||||
|
79vsnt3nyaD03bTAafpHXkd+IJxQowiG8TfOJF0R/Qg9g7DCE66R9agQpMJCSGxI
|
||||||
|
in9p/4ld4Hn6869d4hNq4fHxNf/qkj2cnf8DYxrldz2FGsi6yMed4tzz2Am4ZbPg
|
||||||
|
wep+fy843ZdYrVIms9vJluNa9E+6Vpw9FwdjzQ/IBBMLvGaC2pDkc95YelaEnQrA
|
||||||
|
lTO/0l5vjc8XuTQFlo3DbUg+WEld/pxvCqsd/q1mqjL0WbxtXl2zCwGzAoJxrjVE
|
||||||
|
PfA8QSbtAgMBAAECggEACgNK4klq1T3IpKdNoBY5yoE7CbUQZBNkBpSPRxHgBezj
|
||||||
|
SVFfgrZGnOySrIJSt4JHtuynG2Hl+0ku74HRep/ck+eOsh5W3mZvGvMLnGxhwR3u
|
||||||
|
Or99osTIgU0VQTkpC0SLQ16FCnih0uJycNIikdLR7uuya1tt1OyIBzK7XlNGIywT
|
||||||
|
p85zJc7/6TfTC9eM7lqh7JGR7KplBxSvgZL1pUr7y4rNpKms6uzOvPND79CcKnbU
|
||||||
|
BBA9Tu4qdOkoOljsZKkvh3pihxyG9X6d8QTZ/uX3pkvliwSFBc+Sz9EootA3/4r5
|
||||||
|
gVWpQ2t/AY7fY4hqzLIX/HivVaPj3cWk1G+SHm0XNQKBgQD5I9rijqFvV/p6FmUl
|
||||||
|
FbnjJFFHHgZLivlGxAC5vOyJNQQaqdeDzg7yMotNmQTggVGjT6sjdosQb3n+ctuk
|
||||||
|
EhQnZSU5VkNKv1+PTR35WrRkaECCaqz3Pv79pV9GVcX3it7UuYjNiOeSPqINWe+X
|
||||||
|
49JwnJFz+qQ1BchAwOis4zkENwKBgQD4mShDaYLOO97VpgZj4cGxHHWyEK9CRQvp
|
||||||
|
I7HxRmfaWS3JHwb88lOmALEU6pAj5cYJPAznv8BnUWcVHalZbkQ1JWYtUJRqj6OI
|
||||||
|
Ym7rw/nm4Ay5ijbdEism173dSk3IjOe+PdAlxzsOuVzYdBTqElmeQWtBzhY9aHvX
|
||||||
|
r+A02C2j+wKBgHHDo6Gsi57yR5gUPd9vSlCkNtEIrss0DJv5yHMIB+KnaNZcE+NF
|
||||||
|
5qFF30Jxyz5RDtxJ9tXcvaeln8lG3XDQKI/MqfDCqTuqo5ImHrfMaW8oA70JxS2p
|
||||||
|
gHqGVzkg1aMxsIrmpcdk6olnPExocvWivGdbtzeEjhMALu8Sp6y6nUCFAoGBAK5h
|
||||||
|
KLgYw/OMVaQCIMthaa+l6f0s7PMMYe1453H6VBD6qz4/8HPwO7LfG1gzrUYxADgs
|
||||||
|
ElVh0UHn/On383nS+i9Ze5Hfyyvwc+LQQURKJPrJQMPJavCptPE7NmiKnYNHK6vr
|
||||||
|
yh0l4oxShAklbCJBGvICq4zuVfVfXDeQnDIVTfaPAoGBAMCrZqYdOUhUu+aUqxZq
|
||||||
|
qO/TTQxrxftU63jGUg+o042TdgI4KWLn07wvHJ8/E2OqF35eXenvcuKbNLI1l72J
|
||||||
|
4cp+3cUv8iAXThTRYEztr5CS/wta4o4CNN8zfjn5dV9AI4Hmt4V7EaGWpBcViGbj
|
||||||
|
n0Mhag+dO8DHuenqi1yfMrAt
|
||||||
|
-----END PRIVATE KEY-----
|
||||||
152
tools/mk-sbuild-chroot.sh
Executable file
152
tools/mk-sbuild-chroot.sh
Executable file
@@ -0,0 +1,152 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
#
|
||||||
|
# Bootstrap an sbuild chroot for the clean-room build gate (mkdeb.sh --sbuild).
|
||||||
|
#
|
||||||
|
# Uses the rootless unshare backend: no root, no schroot daemon. It builds a
|
||||||
|
# minimal buildd chroot tarball into ~/.cache/sbuild/<dist>-<arch>.tar.zst, where
|
||||||
|
# sbuild --dist=<dist> finds it automatically in unshare mode.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# tools/mk-sbuild-chroot.sh [options]
|
||||||
|
#
|
||||||
|
# Options:
|
||||||
|
# -d, --dist DIST suite to bootstrap (default: unstable)
|
||||||
|
# -a, --arch ARCH architecture (default: dpkg --print-architecture)
|
||||||
|
# -m, --mirror URL apt mirror (default: http://deb.debian.org/debian)
|
||||||
|
# --components LIST comma-separated components (default: main)
|
||||||
|
# -f, --force rebuild even if the tarball already exists
|
||||||
|
# --write-sbuildrc add "$chroot_mode = 'unshare';" to ~/.sbuildrc if absent
|
||||||
|
# -h, --help show this help
|
||||||
|
#
|
||||||
|
# One-time setup; refresh later with sbuild-update or by rerunning with --force.
|
||||||
|
# Requires mmdebstrap and the uidmap tools (newuidmap) for the unshare backend.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
readonly PROGNAME=${0##*/}
|
||||||
|
|
||||||
|
die() {
|
||||||
|
printf '%s: error: %s\n' "$PROGNAME" "$*" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
info() {
|
||||||
|
printf '==> %s\n' "$*" >&2
|
||||||
|
}
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
sed -n '2,/^set -euo/{/^set -euo/!p}' "$0" | sed 's/^# \{0,1\}//'
|
||||||
|
}
|
||||||
|
|
||||||
|
need() {
|
||||||
|
local tool
|
||||||
|
for tool in "$@"; do
|
||||||
|
command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
main() {
|
||||||
|
local dist=unstable
|
||||||
|
local arch=""
|
||||||
|
local mirror=http://deb.debian.org/debian
|
||||||
|
local components=main
|
||||||
|
local force=0
|
||||||
|
local write_sbuildrc=0
|
||||||
|
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case $1 in
|
||||||
|
-d | --dist)
|
||||||
|
[[ $# -ge 2 ]] || die "missing argument for $1"
|
||||||
|
dist=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-a | --arch)
|
||||||
|
[[ $# -ge 2 ]] || die "missing argument for $1"
|
||||||
|
arch=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-m | --mirror)
|
||||||
|
[[ $# -ge 2 ]] || die "missing argument for $1"
|
||||||
|
mirror=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--components)
|
||||||
|
[[ $# -ge 2 ]] || die "missing argument for $1"
|
||||||
|
components=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-f | --force)
|
||||||
|
force=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
|
--write-sbuildrc)
|
||||||
|
write_sbuildrc=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
|
-h | --help)
|
||||||
|
usage
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
die "unknown option: $1 (try --help)"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
need mmdebstrap dpkg
|
||||||
|
# Unshare needs the setuid uid/gid mappers; mmdebstrap fails cryptically without.
|
||||||
|
command -v newuidmap >/dev/null 2>&1 ||
|
||||||
|
die "newuidmap not found; install the uidmap package for the unshare backend"
|
||||||
|
|
||||||
|
# Unshare maps a whole UID range, not just the caller's: the base install
|
||||||
|
# creates system users, and without an /etc/subuid+subgid range the install
|
||||||
|
# crashes (dpkg SIGSEGV) instead of erroring cleanly. Root uses mode=root and
|
||||||
|
# needs no range.
|
||||||
|
if [[ $(id -u) -ne 0 ]]; then
|
||||||
|
local me
|
||||||
|
me=$(id -un)
|
||||||
|
if ! grep -qs "^$me:" /etc/subuid || ! grep -qs "^$me:" /etc/subgid; then
|
||||||
|
# Suggest a range starting past every allocation in either file.
|
||||||
|
local start
|
||||||
|
start=$(awk -F: '{e = $2 + $3; if (e > m) m = e} END {print (m ? m : 100000)}' \
|
||||||
|
/etc/subuid /etc/subgid 2>/dev/null)
|
||||||
|
die "no /etc/subuid+subgid range for $me; the unshare backend needs one:
|
||||||
|
sudo usermod --add-subuids $start-$((start + 65535)) --add-subgids $start-$((start + 65535)) $me"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
: "${arch:=$(dpkg --print-architecture)}"
|
||||||
|
local cache=$HOME/.cache/sbuild
|
||||||
|
local tarball=$cache/${dist}-${arch}.tar.zst
|
||||||
|
|
||||||
|
if [[ -e $tarball && $force -eq 0 ]]; then
|
||||||
|
info "chroot already exists: $tarball (use --force to rebuild)"
|
||||||
|
else
|
||||||
|
info "bootstrapping $dist/$arch chroot into $tarball"
|
||||||
|
mkdir -p "$cache"
|
||||||
|
mmdebstrap --variant=buildd --arch="$arch" --components="$components" \
|
||||||
|
"$dist" "$tarball" "$mirror"
|
||||||
|
info "chroot ready: $tarball"
|
||||||
|
fi
|
||||||
|
|
||||||
|
local rc=$HOME/.sbuildrc
|
||||||
|
local mode_line="\$chroot_mode = 'unshare';"
|
||||||
|
# shellcheck disable=SC2016 # $chroot_mode is literal regex text, not a shell var.
|
||||||
|
if grep -qsE '^[[:space:]]*\$chroot_mode[[:space:]]*=.*unshare' "$rc"; then
|
||||||
|
: # already configured (active, non-commented line)
|
||||||
|
elif [[ $write_sbuildrc -eq 1 ]]; then
|
||||||
|
info "enabling the unshare backend in $rc"
|
||||||
|
printf '%s\n' "$mode_line" >>"$rc"
|
||||||
|
else
|
||||||
|
cat >&2 <<EOF
|
||||||
|
==> To use this chroot without passing --chroot-mode each time, add to $rc:
|
||||||
|
$mode_line
|
||||||
|
(or rerun with --write-sbuildrc). Then verify with:
|
||||||
|
sbuild --dist=$dist path/to/package.dsc
|
||||||
|
and build the release gate with:
|
||||||
|
tools/mkdeb.sh --source-only --sbuild
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
128
tools/mkdeb.sh
128
tools/mkdeb.sh
@@ -20,11 +20,27 @@
|
|||||||
# Options:
|
# Options:
|
||||||
# -k, --key KEYID GPG key for signing (default: $DEBSIGN_KEYID)
|
# -k, --key KEYID GPG key for signing (default: $DEBSIGN_KEYID)
|
||||||
# -o, --outdir DIR output directory (default: <repo>/dist)
|
# -o, --outdir DIR output directory (default: <repo>/dist)
|
||||||
|
# --orig FILE reuse this upstream orig tarball instead of
|
||||||
|
# regenerating it (required for a Debian revision
|
||||||
|
# >= 2, whose orig is frozen in the archive)
|
||||||
# -s, --source-only build only the source package
|
# -s, --source-only build only the source package
|
||||||
# -u, --unsigned do not sign anything (implies no release sigs)
|
# -u, --unsigned do not sign anything (implies no release sigs)
|
||||||
# --no-release-artifacts skip the orig tarball .asc/.md5/.sha1
|
# --no-release-artifacts skip the orig tarball .asc/.md5/.sha1
|
||||||
|
# --sbuild additionally build the .dsc in a clean sbuild
|
||||||
|
# chroot as a from-scratch verification gate
|
||||||
# -h, --help show this help
|
# -h, --help show this help
|
||||||
#
|
#
|
||||||
|
# --sbuild reproduces the buildd environment: it builds the source package in a
|
||||||
|
# minimal chroot holding only the declared Build-Depends, so an FTBFS or a
|
||||||
|
# missing dependency fails here instead of on the archive's buildds (which, with
|
||||||
|
# a source-only upload, are otherwise the first clean build). It needs an sbuild
|
||||||
|
# chroot for the changelog's distribution; create one once with the companion
|
||||||
|
# tools/mk-sbuild-chroot.sh (rootless unshare backend).
|
||||||
|
#
|
||||||
|
# The Debian revision in debian/changelog decides the orig: revision 1 builds a
|
||||||
|
# fresh upstream tarball; revision >= 2 must reuse the orig frozen at revision 1
|
||||||
|
# (the .dsc references it by checksum), so pass it with --orig.
|
||||||
|
#
|
||||||
# SOURCE_DATE_EPOCH is honored for reproducible output.
|
# SOURCE_DATE_EPOCH is honored for reproducible output.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
@@ -57,9 +73,11 @@ need() {
|
|||||||
main() {
|
main() {
|
||||||
local key=${DEBSIGN_KEYID:-}
|
local key=${DEBSIGN_KEYID:-}
|
||||||
local outdir=""
|
local outdir=""
|
||||||
|
local orig_in=""
|
||||||
local source_only=0
|
local source_only=0
|
||||||
local unsigned=0
|
local unsigned=0
|
||||||
local release_artifacts=1
|
local release_artifacts=1
|
||||||
|
local sbuild=0
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
@@ -73,6 +91,11 @@ main() {
|
|||||||
outdir=$2
|
outdir=$2
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--orig)
|
||||||
|
[[ $# -ge 2 ]] || die "missing argument for $1"
|
||||||
|
orig_in=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
-s | --source-only)
|
-s | --source-only)
|
||||||
source_only=1
|
source_only=1
|
||||||
shift
|
shift
|
||||||
@@ -85,6 +108,10 @@ main() {
|
|||||||
release_artifacts=0
|
release_artifacts=0
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--sbuild)
|
||||||
|
sbuild=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
-h | --help)
|
-h | --help)
|
||||||
usage
|
usage
|
||||||
exit 0
|
exit 0
|
||||||
@@ -95,7 +122,8 @@ main() {
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
need git autoreconf debuild dcmd
|
need git autoreconf debuild dcmd dpkg-parsechangelog
|
||||||
|
[[ $sbuild -eq 1 ]] && need sbuild
|
||||||
if [[ $unsigned -eq 0 ]]; then
|
if [[ $unsigned -eq 0 ]]; then
|
||||||
need gpg
|
need gpg
|
||||||
[[ -n $key ]] || die "no signing key (pass --key or set DEBSIGN_KEYID, or use --unsigned)"
|
[[ -n $key ]] || die "no signing key (pass --key or set DEBSIGN_KEYID, or use --unsigned)"
|
||||||
@@ -107,6 +135,11 @@ main() {
|
|||||||
mkdir -p "$outdir"
|
mkdir -p "$outdir"
|
||||||
outdir=$(cd "$outdir" && pwd)
|
outdir=$(cd "$outdir" && pwd)
|
||||||
|
|
||||||
|
if [[ -n $orig_in ]]; then
|
||||||
|
[[ -r $orig_in ]] || die "--orig file not readable: $orig_in"
|
||||||
|
orig_in=$(cd "$(dirname "$orig_in")" && pwd)/$(basename "$orig_in")
|
||||||
|
fi
|
||||||
|
|
||||||
scratch=$(mktemp -d "${TMPDIR:-/tmp}/httrack-mkdeb.XXXXXX")
|
scratch=$(mktemp -d "${TMPDIR:-/tmp}/httrack-mkdeb.XXXXXX")
|
||||||
trap 'rm -rf -- "$scratch"' EXIT
|
trap 'rm -rf -- "$scratch"' EXIT
|
||||||
|
|
||||||
@@ -118,10 +151,31 @@ main() {
|
|||||||
git -C "$repo/src/coucal" archive --format=tar --prefix=src/coucal/ HEAD |
|
git -C "$repo/src/coucal" archive --format=tar --prefix=src/coucal/ HEAD |
|
||||||
tar -x -C "$export_dir"
|
tar -x -C "$export_dir"
|
||||||
|
|
||||||
# Refresh build system and man page, then build the tarball. We build here
|
# Upstream version and Debian revision drive the orig: revision 1 builds a
|
||||||
# only because regen-man needs the compiled binaries; the test suite is not
|
# fresh tarball, revision >= 2 reuses the one frozen at -1 (the .dsc pins it
|
||||||
# run in this pass. debuild (below) runs the full suite once, with the online
|
# by checksum, so a regenerated orig with new mtimes would be rejected).
|
||||||
# tests enabled, so a check here would just be a slower, offline-only repeat.
|
local fullver ver rev
|
||||||
|
fullver=$(cd "$export_dir" && dpkg-parsechangelog -S Version)
|
||||||
|
ver=${fullver%-*}
|
||||||
|
rev=${fullver##*-}
|
||||||
|
local orig=httrack_${ver}.orig.tar.gz
|
||||||
|
info "version $ver (Debian revision $rev)"
|
||||||
|
|
||||||
|
# A signed build is upload-bound, so a revision >= 2 must reuse the frozen
|
||||||
|
# orig (--orig); an unsigned build is a throwaway (CI, local) and may
|
||||||
|
# regenerate it, since it can never reach the archive.
|
||||||
|
if [[ -z $orig_in && $rev != 1 && $unsigned -eq 0 ]]; then
|
||||||
|
die "Debian revision $rev needs --orig FILE (the orig is frozen from revision 1)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n $orig_in ]]; then
|
||||||
|
info "reusing upstream tarball $orig_in"
|
||||||
|
cp -- "$orig_in" "$scratch/$orig"
|
||||||
|
else
|
||||||
|
# Refresh build system and man page, then build the tarball. We build
|
||||||
|
# here only because regen-man needs the compiled binaries; the test
|
||||||
|
# suite is not run in this pass. debuild (below) runs the full suite
|
||||||
|
# once, online tests enabled, so a check here would just repeat it.
|
||||||
info "regenerating build system and man page"
|
info "regenerating build system and man page"
|
||||||
(
|
(
|
||||||
cd "$export_dir"
|
cd "$export_dir"
|
||||||
@@ -129,34 +183,33 @@ main() {
|
|||||||
./configure --quiet
|
./configure --quiet
|
||||||
make -s -j"$(nproc)"
|
make -s -j"$(nproc)"
|
||||||
make -s -C man regen-man
|
make -s -C man regen-man
|
||||||
# Build the tarball from a clean tree so no object files leak into it.
|
# Build the tarball from a clean tree so no object files leak in.
|
||||||
make -s clean
|
make -s clean
|
||||||
make -s dist
|
make -s dist
|
||||||
)
|
)
|
||||||
|
|
||||||
local tarball ver
|
|
||||||
local -a tarballs
|
local -a tarballs
|
||||||
shopt -s nullglob
|
shopt -s nullglob
|
||||||
tarballs=("$export_dir"/httrack-*.tar.gz)
|
tarballs=("$export_dir"/httrack-*.tar.gz)
|
||||||
shopt -u nullglob
|
shopt -u nullglob
|
||||||
[[ ${#tarballs[@]} -ge 1 ]] || die "make dist produced no tarball"
|
[[ ${#tarballs[@]} -ge 1 ]] || die "make dist produced no tarball"
|
||||||
tarball=${tarballs[0]##*/}
|
local tarball=${tarballs[0]##*/}
|
||||||
ver=${tarball#httrack-}
|
[[ $tarball == "httrack-$ver.tar.gz" ]] ||
|
||||||
ver=${ver%.tar.gz}
|
die "changelog version $ver disagrees with built tarball $tarball (configure.ac mismatch?)"
|
||||||
info "version $ver"
|
cp -- "$export_dir/$tarball" "$scratch/$orig"
|
||||||
|
fi
|
||||||
|
|
||||||
# 3.0 (quilt): orig tarball is upstream-only; debian/ is overlaid on top.
|
# 3.0 (quilt): orig tarball is upstream-only; debian/ is overlaid on top.
|
||||||
local orig=httrack_${ver}.orig.tar.gz
|
|
||||||
cp -- "$export_dir/$tarball" "$scratch/$orig"
|
|
||||||
(
|
(
|
||||||
cd "$scratch"
|
cd "$scratch"
|
||||||
tar -xf "$orig"
|
tar -xf "$orig"
|
||||||
|
[[ -d httrack-$ver ]] || die "orig tarball does not unpack to httrack-$ver/"
|
||||||
cp -a "$export_dir/debian" "httrack-$ver/debian"
|
cp -a "$export_dir/debian" "httrack-$ver/debian"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build (debuild also runs lintian and signs). --fail-on aborts on a lintian
|
# Build and sign. debuild runs lintian too but does NOT propagate its exit
|
||||||
# error or warning, so neither a release nor CI produces an unclean package.
|
# status, so a broken package would pass unnoticed; disable it here and run
|
||||||
local -a debuild_opts=(--lintian-opts -I -i "--fail-on=error,warning")
|
# lintian ourselves below as the real gate.
|
||||||
|
local -a debuild_opts=(--no-lintian)
|
||||||
local -a build_opts=()
|
local -a build_opts=()
|
||||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||||
if [[ $unsigned -eq 1 ]]; then
|
if [[ $unsigned -eq 1 ]]; then
|
||||||
@@ -167,7 +220,8 @@ main() {
|
|||||||
info "building packages with debuild"
|
info "building packages with debuild"
|
||||||
(
|
(
|
||||||
cd "$scratch/httrack-$ver"
|
cd "$scratch/httrack-$ver"
|
||||||
debuild "${build_opts[@]}" "${debuild_opts[@]}"
|
# debuild options (--no-lintian) must precede the dpkg-buildpackage ones
|
||||||
|
debuild "${debuild_opts[@]}" "${build_opts[@]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Collect every file the .changes references (orig, dsc, debs, ddebs, buildinfo).
|
# Collect every file the .changes references (orig, dsc, debs, ddebs, buildinfo).
|
||||||
@@ -177,11 +231,49 @@ main() {
|
|||||||
changes=("$scratch"/*.changes)
|
changes=("$scratch"/*.changes)
|
||||||
shopt -u nullglob
|
shopt -u nullglob
|
||||||
[[ ${#changes[@]} -ge 1 ]] || die "debuild produced no .changes file"
|
[[ ${#changes[@]} -ge 1 ]] || die "debuild produced no .changes file"
|
||||||
|
|
||||||
|
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||||
|
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||||
|
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||||
|
# means the local lintian is older than the buildds', not a package
|
||||||
|
# defect, so suppress it. set -e turns any error/warning tag into a failure.
|
||||||
|
info "running lintian gate (--fail-on=error,warning)"
|
||||||
|
lintian --profile debian -I -i --fail-on=error,warning \
|
||||||
|
--suppress-tags newer-standards-version "${changes[@]}"
|
||||||
|
|
||||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||||
|
|
||||||
|
# Clean-room build gate: rebuild the source package in a minimal chroot that
|
||||||
|
# holds only the declared Build-Depends, the same way the buildds will. An
|
||||||
|
# undeclared dependency or any FTBFS aborts the release here instead of
|
||||||
|
# surfacing after a source-only upload. Logs and clean-built debs land in
|
||||||
|
# $outdir/sbuild for inspection.
|
||||||
|
if [[ $sbuild -eq 1 ]]; then
|
||||||
|
local -a dscs
|
||||||
|
shopt -s nullglob
|
||||||
|
dscs=("$scratch"/*.dsc)
|
||||||
|
shopt -u nullglob
|
||||||
|
[[ ${#dscs[@]} -ge 1 ]] || die "no .dsc to sbuild"
|
||||||
|
|
||||||
|
local dist
|
||||||
|
dist=$(cd "$scratch/httrack-$ver" && dpkg-parsechangelog -S Distribution)
|
||||||
|
[[ $dist == UNRELEASED ]] && dist=unstable
|
||||||
|
|
||||||
|
info "clean-room build with sbuild (dist $dist)"
|
||||||
|
local sbdir=$outdir/sbuild
|
||||||
|
rm -rf -- "$sbdir"
|
||||||
|
mkdir -p "$sbdir"
|
||||||
|
(cd "$sbdir" && sbuild --dist="$dist" -- "${dscs[0]}")
|
||||||
|
info "sbuild clean-room build passed; logs in $sbdir"
|
||||||
|
fi
|
||||||
|
|
||||||
# Release artifacts for the upstream tarball (detached sig + checksums).
|
# Release artifacts for the upstream tarball (detached sig + checksums).
|
||||||
|
# A Debian revision >= 2 .changes omits the orig (it is already in the
|
||||||
|
# archive), so dcmd above won't have copied it; place it from the build tree
|
||||||
|
# so the website artifacts are produced regardless of the revision.
|
||||||
if [[ $release_artifacts -eq 1 && $unsigned -eq 0 ]]; then
|
if [[ $release_artifacts -eq 1 && $unsigned -eq 0 ]]; then
|
||||||
info "signing upstream tarball"
|
info "signing upstream tarball"
|
||||||
|
cp -- "$scratch/$orig" "$outdir/$orig"
|
||||||
(
|
(
|
||||||
cd "$outdir"
|
cd "$outdir"
|
||||||
gpg --armor --detach-sign --yes -u "$key" -- "$orig"
|
gpg --armor --detach-sign --yes -u "$key" -- "$orig"
|
||||||
|
|||||||
Reference in New Issue
Block a user