mirror of
https://github.com/xroche/httrack.git
synced 2026-07-04 16:14:47 +03:00
Compare commits
39 Commits
fix/urlhac
...
wait-socke
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
75fd5ae90f | ||
|
|
1e7744865f | ||
|
|
dfafe28002 | ||
|
|
a3f04bde72 | ||
|
|
11beef52e1 | ||
|
|
d7c4eab1f5 | ||
|
|
2eac19655b | ||
|
|
83c231d50e | ||
|
|
9d29b8329b | ||
|
|
ac4a1ca48e | ||
|
|
9f2f2e52fa | ||
|
|
92db2f2b41 | ||
|
|
ec52112446 | ||
|
|
1eaddc9c0e | ||
|
|
d97a7bdfd9 | ||
|
|
d2d02d87c2 | ||
|
|
4958bb8666 | ||
|
|
07da404cb8 | ||
|
|
694e45c698 | ||
|
|
db9ec2cc3b | ||
|
|
6a9ab2a11f | ||
|
|
13b31986d5 | ||
|
|
bd7e0989f6 | ||
|
|
bd74ec7cab | ||
|
|
1ed8ffad64 | ||
|
|
b68de172fa | ||
|
|
aabfd34380 | ||
|
|
65ff9e0f11 | ||
|
|
730a1c8c5b | ||
|
|
f9ee4702a2 | ||
|
|
cca83e5f4a | ||
|
|
97f398e508 | ||
|
|
a62f93a107 | ||
|
|
799ec88dc7 | ||
|
|
71af4a24f0 | ||
|
|
e17f4f12a0 | ||
|
|
5be8ba4bbd | ||
|
|
247a46068e | ||
|
|
669947cd23 |
49
.github/workflows/ci.yml
vendored
49
.github/workflows/ci.yml
vendored
@@ -61,6 +61,50 @@ jobs:
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Reproduce the Debian buildds: they build in a minimal chroot with no
|
||||
# python3, so the local-server tests must SKIP (exit 77), not fail. GitHub
|
||||
# runners ship python3, so every other job hides this path; here we remove it
|
||||
# before `make check`. This is the guard that would have caught the 3.49.10-1
|
||||
# FTBFS (28_local-pause failed instead of skipping when python3 was absent).
|
||||
buildd-no-python3:
|
||||
name: build (no python3, Debian buildd)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev libssl-dev
|
||||
|
||||
- name: Configure
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure
|
||||
|
||||
- name: Build
|
||||
run: make -j"$(nproc)"
|
||||
|
||||
- name: Test without python3
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Hide every python3* so `command -v python3` fails like it does in the
|
||||
# buildd chroot; masking with /bin/false would still resolve.
|
||||
sudo find /usr/bin /usr/local/bin -maxdepth 1 -name 'python3*' \
|
||||
-exec mv {} {}.hidden \;
|
||||
! command -v python3
|
||||
make check
|
||||
|
||||
- name: Print the test log on failure
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Portability: build and test on macOS (Darwin/clang) on a native runner --
|
||||
# no VM. The tree has no __APPLE__ branches, so Darwin exercises the
|
||||
# generic-Unix path on a second libc and kernel. brew's openssl@3 is keg-only,
|
||||
@@ -225,8 +269,9 @@ jobs:
|
||||
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||
# 01_engine-* only; zlib-dependent self-tests are named 01_zlib-* and
|
||||
# skipped here (uninstrumented libz floods MSan with false positives).
|
||||
tests="$(cd tests && ls 01_engine-*.test | tr '\n' ' ')"
|
||||
make check TESTS="$tests"
|
||||
|
||||
- name: Print the test log on failure
|
||||
|
||||
@@ -39,6 +39,10 @@ Welcome, and nothing to disclose. Two rules:
|
||||
|
||||
The sign-off covers AI-assisted code too.
|
||||
|
||||
## Translations
|
||||
|
||||
Interface strings live in [`lang/`](lang/). See [lang/README.md](lang/README.md) for the file format and how to add or update a language.
|
||||
|
||||
## Bugs
|
||||
|
||||
Open an issue with the version, OS, command used, and expected vs actual result.
|
||||
|
||||
20
configure.ac
20
configure.ac
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.9], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.10], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,10 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:1:0: 3.49.9 changed code but not the exported interface vs 3.49.8 (same 164
|
||||
# symbols, no struct-layout change), so bump revision only. (3:0:0 was the htsblk
|
||||
# mime-buffer widening, an ABI break that moved the soname .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:1:0"
|
||||
# 3:2:0: 3.49.10 only appends tail fields to the options struct (no existing
|
||||
# symbol or offset changed vs 3.49.9), so it stays soname .so.3; bump revision.
|
||||
# (3:0:0 was the htsblk mime-buffer widening, the ABI break that moved .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:2:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
@@ -63,6 +63,16 @@ AC_SUBST(LT_CV_OBJDIR,$lt_cv_objdir)
|
||||
# Export version info
|
||||
AC_SUBST(VERSION_INFO)
|
||||
|
||||
# Versioned plugin name for dlopen() in hts_create_opt(); soname major is
|
||||
# libtool's current - age, so this tracks VERSION_INFO bumps automatically.
|
||||
HTS_SONAME_MAJOR=$((${VERSION_INFO%%:*} - ${VERSION_INFO##*:}))
|
||||
case "$host_os" in
|
||||
darwin*) HTS_LIBHTSJAVA_NAME="libhtsjava.$HTS_SONAME_MAJOR.dylib" ;;
|
||||
*) HTS_LIBHTSJAVA_NAME="libhtsjava.so.$HTS_SONAME_MAJOR" ;;
|
||||
esac
|
||||
AC_DEFINE_UNQUOTED([HTS_LIBHTSJAVA_NAME], ["$HTS_LIBHTSJAVA_NAME"],
|
||||
[Versioned libhtsjava runtime name, derived from VERSION_INFO])
|
||||
|
||||
### Default CFLAGS
|
||||
DEFAULT_CFLAGS="-Wall -Wformat -Wformat-security \
|
||||
-Wmultichar -Wwrite-strings -Wcast-qual -Wcast-align \
|
||||
|
||||
13
debian/changelog
vendored
13
debian/changelog
vendored
@@ -1,3 +1,16 @@
|
||||
httrack (3.49.10-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: new download-pacing and URL-handling options plus a
|
||||
batch of crawl and robustness fixes (full list in history.txt).
|
||||
* Rewrite debian/copyright in machine-readable DEP-5 format, crediting the
|
||||
bundled minizip, md5 and coucal sources (#415).
|
||||
* Lead the webhttrack browser dependency with chromium so httrack is not
|
||||
dragged into the firefox-esr autoremoval cascade (#436).
|
||||
* Override the embedded-library lint for the bundled minizip (#419).
|
||||
* Bump Standards-Version to 4.7.4 (no changes required).
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 28 Jun 2026 14:01:53 +0200
|
||||
|
||||
httrack (3.49.9-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: Content-Type and file-type detection fixes (trust a
|
||||
|
||||
7
debian/control
vendored
7
debian/control
vendored
@@ -1,9 +1,8 @@
|
||||
Source: httrack
|
||||
Section: web
|
||||
Priority: optional
|
||||
Maintainer: Xavier Roche <roche@httrack.com>
|
||||
Standards-Version: 4.7.0
|
||||
Build-Depends: debhelper-compat (= 13), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Standards-Version: 4.7.4
|
||||
Build-Depends: debhelper-compat (= 14), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Rules-Requires-Root: no
|
||||
Homepage: http://www.httrack.com
|
||||
Vcs-Git: https://github.com/xroche/httrack.git
|
||||
@@ -30,7 +29,7 @@ Description: Copy websites to your computer (Offline browser)
|
||||
Package: webhttrack
|
||||
Architecture: any
|
||||
Multi-Arch: foreign
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, firefox-esr | chromium | www-browser
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, chromium | firefox-esr | www-browser
|
||||
Replaces: webhttrack-common (<< 3.43.9-2)
|
||||
Breaks: webhttrack-common (<< 3.43.9-2)
|
||||
Suggests: httrack, httrack-doc
|
||||
|
||||
4
debian/source/lintian-overrides
vendored
4
debian/source/lintian-overrides
vendored
@@ -1,4 +1,6 @@
|
||||
httrack source: changelog-should-mention-nmu
|
||||
# Maintainer uploads sign the changelog as xavier@debian.org while the control
|
||||
# Maintainer is roche@httrack.com; lintian reads the address mismatch as an NMU.
|
||||
httrack source: no-nmu-in-changelog
|
||||
httrack source: source-nmu-has-incorrect-version-number
|
||||
|
||||
# The bundled HTML pages are the genuine upstream documentation taken from
|
||||
|
||||
6
debian/upstream/metadata
vendored
Normal file
6
debian/upstream/metadata
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
Repository: https://github.com/xroche/httrack.git
|
||||
Repository-Browse: https://github.com/xroche/httrack
|
||||
Bug-Database: https://github.com/xroche/httrack/issues
|
||||
Bug-Submit: https://github.com/xroche/httrack/issues/new
|
||||
Contact: Xavier Roche <roche@httrack.com>
|
||||
20
history.txt
20
history.txt
@@ -4,7 +4,25 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-9
|
||||
3.49-10
|
||||
+ New: --cookies-file to preload a Netscape cookies.txt before crawling (#215)
|
||||
+ New: --pause to space out file downloads by a random delay (#185)
|
||||
+ New: --strip-query to drop selected query keys from the dedup naming (#112)
|
||||
+ Changed: split the -%u URL hacks into independent --keep-www-prefix, --keep-double-slashes and --keep-query-order toggles (#271)
|
||||
+ Fixed: follow a redirect Location after dropping its #fragment, instead of requesting the fragment and polluting the saved name (#204)
|
||||
+ Fixed: escaped brackets inside a *[...] filter character class (#148)
|
||||
+ Fixed: honor the server's Content-Range when resuming a partial download, instead of appending overlapping bytes (#198)
|
||||
+ Fixed: abort the download as soon as the response type is excluded by -mime:, instead of fetching then discarding the body (#58)
|
||||
+ Fixed: keep size-based filter rules neutral until the file size is known (#143)
|
||||
+ Fixed: stop the mirror with a clean fatal error on a cache write failure, instead of crashing (#174, #219)
|
||||
+ Fixed: stop the 412/416 partial re-get loop on --continue and --update (#206)
|
||||
+ Fixed: keep an unrecognized URL tail instead of mangling it to .html (#115)
|
||||
+ Fixed: honor --tolerant (-%B) on a broken Content-Length, and fix an out-of-bounds read it exposed (#32, #41)
|
||||
+ Fixed: fall back to the next resolved address when a connection fails or stalls, instead of hanging on a dead IPv6 address
|
||||
+ Fixed: report why a -%L URL list could not be loaded (#49)
|
||||
+ Changed: multiple internal hardening, build and CI improvements
|
||||
|
||||
3.49-9
|
||||
+ Fixed: file-type detection from the Content-Type header: trust a declared type over a binary URL extension, honor --assume under the delayed type check, and keep a known extension against a bogus or empty Content-Type (#267, #29, #56)
|
||||
+ Fixed: an uninitialized-buffer read when the Content-Type is empty (#411)
|
||||
+ Fixed: restored C++ source-compatibility of the installed headers so reverse dependencies (httraqt) build again (#413)
|
||||
|
||||
@@ -247,7 +247,7 @@ See also: The <a href="faq.html#VF1">FAQ</a><br>
|
||||
<td>the \ character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td nowrap><tt>*[\[\]]</tt></td>
|
||||
<td nowrap><tt>*[\[,\]]</tt></td>
|
||||
<td>the [ or ] character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
||||
@@ -295,7 +295,7 @@ Max Depth
|
||||
Maximum external depth:
|
||||
Maximum external depth:
|
||||
Filters (refuse/accept links) :
|
||||
Filters (refuse/accept links) :
|
||||
Filters (refuse/accept links):
|
||||
Paths
|
||||
Paths
|
||||
Save prefs
|
||||
|
||||
37
lang/README.md
Normal file
37
lang/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Translating HTTrack
|
||||
|
||||
Interface strings live here, one `.txt` file per language. `English.txt` is the reference: every other file maps each English string to its translation.
|
||||
|
||||
## File format
|
||||
|
||||
Plain text, entries in consecutive pairs of lines:
|
||||
|
||||
```
|
||||
<English string>
|
||||
<translation>
|
||||
```
|
||||
|
||||
The first line of a pair is the lookup key and must stay identical to the one in `English.txt`; translate only the second line. Missing entries fall back to the English text at runtime, so a partial translation works.
|
||||
|
||||
Preserve any `\r\n`, `\t` and `printf` placeholders (`%s`, `%d`, ...) in the translation.
|
||||
|
||||
A few `LANGUAGE_*` entries at the top describe the file itself:
|
||||
|
||||
| Key | Meaning |
|
||||
| --- | --- |
|
||||
| `LANGUAGE_NAME` | Name shown in the language picker, in its own language (`Deutsch`, not `German`) |
|
||||
| `LANGUAGE_ISO` | ISO 639 code, with region if needed (`de`, `pt_BR`) |
|
||||
| `LANGUAGE_CHARSET` | Encoding the file is saved in (`ISO-8859-1`, `windows-1251`, `UTF-8`, ...) |
|
||||
| `LANGUAGE_AUTHOR` | Your name and contact |
|
||||
| `LANGUAGE_WINDOWSID` | Windows locale name used by WinHTTrack (`German (Standard)`) |
|
||||
|
||||
Save the file in exactly its declared `LANGUAGE_CHARSET`; an editor that rewrites it as UTF-8 will corrupt the non-ASCII bytes.
|
||||
|
||||
## Adding or updating a language
|
||||
|
||||
1. Copy `English.txt` to `<Language>.txt`, or edit the existing file.
|
||||
2. Translate each second line; leave the English keys untouched.
|
||||
3. Fill in the `LANGUAGE_*` header for a new file.
|
||||
4. Open a pull request, or attach the file to a GitHub issue.
|
||||
|
||||
When new strings land in `English.txt` they show up untranslated (as English) until a translator fills them in.
|
||||
@@ -24,6 +24,7 @@ httrack \- offline browser : copy websites to a local directory
|
||||
[ \fB\-EN, \-\-max\-time[=N]\fR ]
|
||||
[ \fB\-AN, \-\-max\-rate[=N]\fR ]
|
||||
[ \fB\-%cN, \-\-connection\-per\-second[=N]\fR ]
|
||||
[ \fB\-%G, \-\-pause\fR ]
|
||||
[ \fB\-GN, \-\-max\-pause[=N]\fR ]
|
||||
[ \fB\-cN, \-\-sockets[=N]\fR ]
|
||||
[ \fB\-TN, \-\-timeout[=N]\fR ]
|
||||
@@ -49,6 +50,7 @@ httrack \- offline browser : copy websites to a local directory
|
||||
[ \fB\-%p, \-\-preserve\fR ]
|
||||
[ \fB\-%T, \-\-utf8\-conversion\fR ]
|
||||
[ \fB\-bN, \-\-cookies[=N]\fR ]
|
||||
[ \fB\-%K, \-\-cookies\-file\fR ]
|
||||
[ \fB\-u, \-\-check\-type[=N]\fR ]
|
||||
[ \fB\-j, \-\-parse\-java[=N]\fR ]
|
||||
[ \fB\-sN, \-\-robots[=N]\fR ]
|
||||
@@ -154,6 +156,8 @@ maximum mirror time in seconds (60=1 minute, 3600=1 hour) (\-\-max\-time[=N])
|
||||
maximum transfer rate in bytes/seconds (1000=1KB/s max) (\-\-max\-rate[=N])
|
||||
.IP \-%cN
|
||||
maximum number of connections/seconds (*%c10) (\-\-connection\-per\-second[=N])
|
||||
.IP \-%G
|
||||
random pause of MIN[:MAX] seconds between files (e.g. %G5:10) (\-\-pause <param>)
|
||||
.IP \-GN
|
||||
pause transfer if N bytes reached, and wait until lock file is deleted (\-\-max\-pause[=N])
|
||||
.SS Flow control:
|
||||
@@ -212,6 +216,8 @@ links conversion to UTF\-8 (\-\-utf8\-conversion)
|
||||
.SS Spider options:
|
||||
.IP \-bN
|
||||
accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N])
|
||||
.IP \-%K
|
||||
load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>)
|
||||
.IP \-u
|
||||
check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N])
|
||||
.IP \-j
|
||||
|
||||
@@ -62,7 +62,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsname.c htsrobots.c htstools.c htswizard.c \
|
||||
htsalias.c htsthread.c htsindex.c htsbauth.c \
|
||||
htsmd5.c htszlib.c htswrap.c htsconcat.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c \
|
||||
htsmodules.c htscharset.c punycode.c htsencoding.c htssniff.c \
|
||||
md5.c \
|
||||
minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \
|
||||
hts-indextmpl.h htsalias.h htsback.h htsbase.h htssafe.h \
|
||||
@@ -70,7 +70,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \
|
||||
htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \
|
||||
htsfilters.h htsftp.h htsglobal.h htshash.h coucal/coucal.h \
|
||||
htshelp.h htsindex.h htslib.h htsmd5.h \
|
||||
htsmodules.h htsname.h htsnet.h \
|
||||
htsmodules.h htsname.h htsnet.h htssniff.h \
|
||||
htsopt.h htsrobots.h htsthread.h \
|
||||
htstools.h htswizard.h htswrap.h htszlib.h \
|
||||
htsstrings.h htsarrays.h httrack-library.h \
|
||||
|
||||
@@ -112,6 +112,10 @@ const char *hts_optalias[][4] = {
|
||||
{"include-query-string", "-%q", "single", ""},
|
||||
{"strip-query", "-%g", "param1",
|
||||
"strip [host/pattern=]key1,key2,... from URLs"},
|
||||
{"cookies-file", "-%K", "param1",
|
||||
"load extra cookies from a Netscape cookies.txt"},
|
||||
{"pause", "-%G", "param1",
|
||||
"random pause of MIN[:MAX] seconds between files"},
|
||||
{"generate-errors", "-o", "single", ""},
|
||||
{"do-not-generate-errors", "-o0", "single", ""},
|
||||
{"purge-old", "-X", "param", ""},
|
||||
|
||||
100
src/htsback.c
100
src/htsback.c
@@ -1359,6 +1359,18 @@ int back_flush_output(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
}
|
||||
|
||||
// effacer entrée
|
||||
/* Discard a cancelled mid-write .delayed placeholder (unusable across runs). */
|
||||
static void back_delayed_discard(httrackp *opt, lien_back *back) {
|
||||
if (back->r.out != NULL) {
|
||||
fclose(back->r.out);
|
||||
back->r.out = NULL;
|
||||
}
|
||||
back->r.is_write = 0;
|
||||
if (opt != NULL)
|
||||
url_savename_refname_remove(opt, back->url_adr, back->url_fil);
|
||||
(void) UNLINK(back->url_sav);
|
||||
}
|
||||
|
||||
int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
const int p) {
|
||||
lien_back *const back = sback->lnk;
|
||||
@@ -1366,6 +1378,12 @@ int back_delete(httrackp * opt, cache_back * cache, struct_back * sback,
|
||||
|
||||
assertf(p >= 0 && p < back_max);
|
||||
if (p >= 0 && p < sback->count) { // on sait jamais..
|
||||
/* mid-write cancel: drop a .delayed placeholder; real-named partials
|
||||
survive for resume (--continue) */
|
||||
if (back[p].r.is_write && IS_DELAYED_EXT(back[p].url_sav) &&
|
||||
(back[p].status != STATUS_READY || back[p].r.statuscode <= 0)) {
|
||||
back_delayed_discard(opt, &back[p]);
|
||||
}
|
||||
// Vérificateur d'intégrité
|
||||
#if DEBUG_CHECKINT
|
||||
_CHECKINT(&back[p], "Appel back_delete")
|
||||
@@ -2237,12 +2255,13 @@ int host_wait(httrackp * opt, lien_back * back) {
|
||||
|
||||
static int slot_can_be_cleaned(const lien_back * back) {
|
||||
return (back->status == STATUS_READY) // ready
|
||||
/* Check autoclean */
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
/* Check autoclean */
|
||||
&& (!back->locked) // not held by hts_wait_delayed (name pending)
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
}
|
||||
|
||||
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
|
||||
@@ -2418,6 +2437,34 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back_clean(opt, cache, sback);
|
||||
#endif
|
||||
|
||||
/* Time limit exceeded past grace: abort in-flight transfers so no wait loop
|
||||
starves (#481). FTP slots stay, their thread owns the socket. */
|
||||
if (!back_checkmirror(opt)) {
|
||||
int aborted = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < (unsigned int) back_max; i++) {
|
||||
if (back[i].status > 0 && back[i].status < STATUS_FTP_TRANSFER) {
|
||||
if (back[i].r.soc != INVALID_SOCKET) {
|
||||
deletehttp(&back[i].r);
|
||||
}
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
/* drop a .delayed placeholder; real partials survive for resume */
|
||||
if (back[i].r.is_write && IS_DELAYED_EXT(back[i].url_sav))
|
||||
back_delayed_discard(opt, &back[i]);
|
||||
back[i].r.statuscode = STATUSCODE_TIMEOUT;
|
||||
strcpybuff(back[i].r.msg, "Mirror Time Out");
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
aborted++;
|
||||
}
|
||||
}
|
||||
if (aborted > 0)
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"time limit reached, %d transfer(s) aborted", aborted);
|
||||
return;
|
||||
}
|
||||
|
||||
// recevoir tant qu'il y a des données (avec un maximum de max_loop boucles)
|
||||
do_wait = 0;
|
||||
gestion_timeout = 0;
|
||||
@@ -2891,10 +2938,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
// range size hack old location
|
||||
|
||||
#if HTS_DIRECTDISK
|
||||
// Court-circuit:
|
||||
// Peut-on stocker le fichier directement sur disque?
|
||||
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
|
||||
if (back[i].status) {
|
||||
// Shortcut: store the file directly on disk when possible,
|
||||
// sparing memory
|
||||
if (back[i].status &&
|
||||
!back[i].locked) { // name still pending when locked
|
||||
if (back[i].r.is_write == 0) { // mode mémoire
|
||||
if (back[i].r.adr == NULL) { // rien n'a été écrit
|
||||
if (!back[i].testmode) { // pas mode test
|
||||
@@ -3960,8 +4007,12 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
&& (back[i].r.adr = (char *) malloct(2))) {
|
||||
back[i].r.adr[0] = 0;
|
||||
}
|
||||
hts_log_print(opt, LOG_TRACE, "finalizing empty");
|
||||
back_finalize(opt, cache, sback, i);
|
||||
/* locked = name pending; the waiter finalizes after
|
||||
patching url_sav (else: cached as .delayed, #5) */
|
||||
if (!back[i].locked) {
|
||||
hts_log_print(opt, LOG_TRACE, "finalizing empty");
|
||||
back_finalize(opt, cache, sback, i);
|
||||
}
|
||||
} else if (!back[i].r.is_chunk) { // pas de chunk
|
||||
//if (back[i].r.http11!=2) { // pas de chunk
|
||||
back[i].is_chunk = 0;
|
||||
@@ -4159,6 +4210,11 @@ int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Grace left to the smooth stop before in-flight transfers are aborted. */
|
||||
static int back_maxtime_grace(const int maxtime) {
|
||||
return maximum(5, minimum(30, maxtime / 10));
|
||||
}
|
||||
|
||||
int back_checkmirror(httrackp * opt) {
|
||||
// Check max size
|
||||
if ((opt->maxsite > 0) && (HTS_STAT.stat_bytes >= opt->maxsite)) {
|
||||
@@ -4175,13 +4231,19 @@ int back_checkmirror(httrackp * opt) {
|
||||
*/
|
||||
}
|
||||
// Check max time
|
||||
if ((opt->maxtime > 0)
|
||||
&& ((time_local() - HTS_STAT.stat_timestart) >= opt->maxtime)) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
if (opt->maxtime > 0) {
|
||||
const TStamp elapsed = time_local() - HTS_STAT.stat_timestart;
|
||||
|
||||
if (elapsed >= opt->maxtime) {
|
||||
if (!opt->state.stop) { /* not yet stopped */
|
||||
hts_log_print(opt, LOG_ERROR, "More than %d seconds passed.. giving up",
|
||||
opt->maxtime);
|
||||
/* cancel mirror smoothly */
|
||||
hts_request_stop(opt, 0);
|
||||
}
|
||||
/* smooth stop starved past the grace period: stop waiting (#481) */
|
||||
if (elapsed - opt->maxtime >= back_maxtime_grace(opt->maxtime))
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1; /* Ok, go on */
|
||||
|
||||
@@ -136,6 +136,8 @@ void back_solve(httrackp * opt, lien_back * sback);
|
||||
int host_wait(httrackp * opt, lien_back * sback);
|
||||
#endif
|
||||
int back_checksize(httrackp * opt, lien_back * eback, int check_only_totalsize);
|
||||
/* Enforce -M/-E quotas: requests a smooth stop when reached; returns 0 once
|
||||
the -E deadline overran its grace period (callers must stop waiting). */
|
||||
int back_checkmirror(httrackp * opt);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -129,6 +129,8 @@ typedef enum HTTPStatusCode {
|
||||
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
|
||||
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
|
||||
HTTP_EXPECTATION_FAILED = 417,
|
||||
HTTP_TOO_MANY_REQUESTS = 429,
|
||||
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
|
||||
HTTP_INTERNAL_SERVER_ERROR = 500,
|
||||
HTTP_NOT_IMPLEMENTED = 501,
|
||||
HTTP_BAD_GATEWAY = 502,
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
|
||||
# Change this to download files
|
||||
if false; then
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
||||
fi
|
||||
|
||||
|
||||
@@ -596,15 +596,18 @@ htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
return cache_readex(opt, cache, adr, fil, save, location, NULL, 1);
|
||||
}
|
||||
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil) {
|
||||
htsblk r = cache_read(opt, cache, adr, fil, NULL, NULL);
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save) {
|
||||
htsblk r = cache_readex(opt, cache, adr, fil, NULL, NULL, return_save, 0);
|
||||
|
||||
if (r.statuscode == -1) {
|
||||
lien_back *itemback = NULL;
|
||||
|
||||
if (back_unserialize_ref(opt, adr, fil, &itemback) == 0) {
|
||||
r = itemback->r;
|
||||
if (return_save != NULL)
|
||||
strlcpybuff(return_save, itemback->url_sav, HTS_URLMAXSIZE * 2);
|
||||
/* cleanup */
|
||||
back_clear_entry(itemback); /* delete entry content */
|
||||
freet(itemback); /* delete item */
|
||||
|
||||
@@ -66,8 +66,11 @@ htsblk cache_read(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
htsblk cache_read_ro(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location);
|
||||
htsblk cache_read_including_broken(httrackp * opt, cache_back * cache,
|
||||
const char *adr, const char *fil);
|
||||
/* Like cache_read, but also yields entries whose transfer broke; return_save
|
||||
(optional, HTS_URLMAXSIZE*2) receives the entry's recorded save name. */
|
||||
htsblk cache_read_including_broken(httrackp *opt, cache_back *cache,
|
||||
const char *adr, const char *fil,
|
||||
char *return_save);
|
||||
htsblk cache_readex(httrackp * opt, cache_back * cache, const char *adr,
|
||||
const char *fil, const char *save, char *location,
|
||||
char *return_save, int readonly);
|
||||
|
||||
@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// catch_url_init(&port,&return_host);
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
|
||||
T_SOC soc;
|
||||
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
|
||||
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
|
||||
int i = 0;
|
||||
|
||||
do {
|
||||
@@ -175,7 +175,9 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
//
|
||||
socinput(soc, line, 1000);
|
||||
if (strnotempty(line)) {
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
|
||||
protocol[256] */
|
||||
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
|
||||
lien_adrfil af;
|
||||
|
||||
// méthode en majuscule
|
||||
|
||||
290
src/htscore.c
290
src/htscore.c
@@ -35,6 +35,7 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <ctype.h>
|
||||
#include <stdint.h> /* uint64_t for the pause mixer (already a hard dep via md5.h) */
|
||||
|
||||
/* File defs */
|
||||
#include "htscore.h"
|
||||
@@ -405,29 +406,106 @@ void hts_invalidate_link(httrackp * opt, int lpos) {
|
||||
opt->liens[lpos]->pass2 = -1;
|
||||
}
|
||||
|
||||
// Write the makeindex footer (refresh meta when makeindex_links==1), close
|
||||
// the file, then run usercommand.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil) {
|
||||
if (!*makeindex_done) {
|
||||
if (*makeindex_fp) {
|
||||
char BIGSTK tempo[1024];
|
||||
if (makeindex_links == 1) {
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE * 2];
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped));
|
||||
snprintf(tempo, sizeof(tempo),
|
||||
"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">" CRLF,
|
||||
link_escaped);
|
||||
} else
|
||||
tempo[0] = '\0';
|
||||
hts_template_format(*makeindex_fp, template_footer,
|
||||
"<!-- Mirror and index made by HTTrack Website "
|
||||
"Copier/" HTTRACK_VERSION " " HTTRACK_AFF_AUTHORS
|
||||
" -->",
|
||||
tempo, /* EOF */ NULL);
|
||||
fflush(*makeindex_fp);
|
||||
fclose(*makeindex_fp);
|
||||
*makeindex_fp = NULL;
|
||||
usercommand(opt, 0, NULL,
|
||||
fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_html_utf8), "index.html"),
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
*makeindex_done = 1;
|
||||
}
|
||||
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Flush the parsed HTML output buffer to disk, skipping the rewrite when the
|
||||
* on-disk MD5 is unchanged. */
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save) {
|
||||
char digest[32 + 2];
|
||||
off_t fsize_old =
|
||||
fsize(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), save));
|
||||
int ok = 0;
|
||||
|
||||
digest[0] = '\0';
|
||||
domd5mem(ht_buff, ht_len, digest, 1);
|
||||
if (fsize_old == (off_t) ht_len) {
|
||||
int mlen = 0;
|
||||
char *mbuff;
|
||||
|
||||
cache_readdata(cache, "//[HTML-MD5]//", save, &mbuff, &mlen);
|
||||
if (mlen)
|
||||
mbuff[mlen] = '\0';
|
||||
if ((mlen == 32) && (strcmp(((mbuff != NULL) ? mbuff : ""), digest) == 0)) {
|
||||
ok = 1;
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s", save);
|
||||
}
|
||||
freet(mbuff);
|
||||
}
|
||||
if (!ok) {
|
||||
file_notify(opt, adr, fil, save, 1, 1, r->notmodified);
|
||||
*fp = filecreate(&opt->state.strc, save);
|
||||
if (*fp) {
|
||||
if (ht_len > 0 && fwrite(ht_buff, 1, ht_len, *fp) != ht_len) {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck)
|
||||
opt->state.exit_xh = -1;
|
||||
if (opt->log) {
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO,
|
||||
"Unable to write HTML file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
}
|
||||
fclose(*fp);
|
||||
*fp = NULL;
|
||||
if (strnotempty(r->lastmodified))
|
||||
set_filetime_rfc822(save, r->lastmodified);
|
||||
} else {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
opt->state.exit_xh = -1;
|
||||
}
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
} else {
|
||||
file_notify(opt, adr, fil, save, 0, 0, r->notmodified);
|
||||
filenote(&opt->state.strc, save, NULL);
|
||||
}
|
||||
if (cache->ndx)
|
||||
cache_writedata(cache->ndx, cache->dat, "//[HTML-MD5]//", save, digest,
|
||||
(int) strlen(digest));
|
||||
}
|
||||
|
||||
/* does it look like XML ? (SVG et al.) */
|
||||
static int look_like_xml(const char *s) {
|
||||
@@ -523,9 +601,12 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
opt->cookie = &cookie;
|
||||
cookie.max_len = 30000; // max len
|
||||
strcpybuff(cookie.data, "");
|
||||
// Charger cookies.txt par défaut ou cookies.txt du miroir
|
||||
// Load the mirror's cookies.txt, then the one in the current directory
|
||||
cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt");
|
||||
cookie_load(opt->cookie, "", "cookies.txt");
|
||||
// A user-supplied cookie file is merged last so it wins on conflicts
|
||||
if (strnotempty(StringBuff(opt->cookies_file)))
|
||||
cookie_load(opt->cookie, "", StringBuff(opt->cookies_file));
|
||||
} else
|
||||
opt->cookie = NULL;
|
||||
|
||||
@@ -1792,90 +1873,18 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
if (strnotempty(savename()) == 0) { // pas de chemin de sauvegarde
|
||||
if (strcmp(urlfil(), "/robots.txt") == 0) { // robots.txt
|
||||
if (r.adr) {
|
||||
int bptr = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK buff[8192];
|
||||
char BIGSTK infobuff[8192];
|
||||
int record = 0;
|
||||
|
||||
line[0] = '\0';
|
||||
buff[0] = '\0';
|
||||
infobuff[0] = '\0';
|
||||
//
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", r.adr);
|
||||
#endif
|
||||
do {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(r.adr + bptr, line, sizeof(line) - 2);
|
||||
/* strip comment */
|
||||
comm = strchr(line, '#');
|
||||
if (comm != NULL) {
|
||||
*comm = '\0';
|
||||
}
|
||||
/* strip spaces */
|
||||
llen = (int) strlen(line);
|
||||
while(llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a;
|
||||
|
||||
a = line + 11;
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // c pour nous
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack")
|
||||
|| strfield(a, "webhttrack")) {
|
||||
buff[0] = '\0'; // re-enregistrer
|
||||
infobuff[0] = '\0';
|
||||
record = 2; // locked
|
||||
#if DEBUG_ROBOTS
|
||||
printf("explicit disallow for httrack\n");
|
||||
#endif
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
if (strfield(line, "disallow:")) {
|
||||
char *a = line + 9;
|
||||
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (strnotempty(a)) {
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
if (strcmp(a, "/") != 0 ||
|
||||
opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
hts_boolean keep_root = (opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
#else
|
||||
hts_boolean keep_root = HTS_TRUE;
|
||||
#endif
|
||||
{ /* ignoring disallow: / */
|
||||
if ((strlen(buff) + strlen(a) + 8) < sizeof(buff)) {
|
||||
strcatbuff(buff, a);
|
||||
strcatbuff(buff, "\n");
|
||||
if ((strlen(infobuff) + strlen(a) + 8) <
|
||||
sizeof(infobuff)) {
|
||||
if (strnotempty(infobuff))
|
||||
strcatbuff(infobuff, ", ");
|
||||
strcatbuff(infobuff, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
else {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"Note: %s robots.txt rules are too restrictive, ignoring /",
|
||||
urladr());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
} while((bptr < r.size) && (strlen(buff) < (sizeof(buff) - 32)));
|
||||
if (strnotempty(buff)) {
|
||||
checkrobots_set(&robots, urladr(), buff);
|
||||
|
||||
robots_parse(&robots, urladr(), r.adr, r.size, infobuff,
|
||||
sizeof(infobuff), keep_root);
|
||||
if (strnotempty(infobuff)) {
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"Note: robots.txt forbidden links for %s are: %s",
|
||||
urladr(), infobuff);
|
||||
@@ -2112,7 +2121,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/*
|
||||
Ensure the index is being closed
|
||||
*/
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp, makeindex_links,
|
||||
makeindex_firstlink, template_footer, "", "");
|
||||
|
||||
/*
|
||||
updating-a-remotely-deleted-website hack
|
||||
@@ -3311,6 +3321,21 @@ HTS_INLINE int back_fillmax(struct_back * sback, httrackp * opt,
|
||||
return -1; /* plus de place */
|
||||
}
|
||||
|
||||
/* Seed-derived: stable within a gap, rerolls per launch; a per-call rand()
|
||||
would bias the delay toward min_ms (see header). Jitter, not crypto. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms) {
|
||||
uint64_t z = (uint64_t) seed;
|
||||
|
||||
if (max_ms <= min_ms)
|
||||
return min_ms;
|
||||
/* SplitMix64 finalizer: scrambles the low-entropy ms timestamp. */
|
||||
z += 0x9E3779B97F4A7C15ULL;
|
||||
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
|
||||
z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
|
||||
z ^= z >> 31;
|
||||
return min_ms + (int) (z % (uint64_t) (max_ms - min_ms + 1));
|
||||
}
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
int n = opt->maxsoc - back_nsoc(sback);
|
||||
|
||||
@@ -3331,9 +3356,56 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
}
|
||||
}
|
||||
|
||||
// #185 randomized inter-file pause: non-blocking, one launch per gap
|
||||
if (n > 0 && opt->pause_max_ms > 0 && HTS_STAT.last_connect > 0) {
|
||||
TStamp opTime =
|
||||
HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect;
|
||||
TStamp lap = mtime_local() - opTime;
|
||||
|
||||
if (lap < hts_pause_target_ms(opTime, opt->pause_min_ms, opt->pause_max_ms))
|
||||
n = 0;
|
||||
else
|
||||
n = 1;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr) {
|
||||
engine_stats();
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
return RUN_CALLBACK7(
|
||||
opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Single implementation of the historical WAIT_FOR_AVAILABLE_SOCKET macros. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr) {
|
||||
const int prev = opt->state._hts_in_html_parsing;
|
||||
|
||||
while (back_pluggable_sockets_strict(sback, opt) <= 0) {
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, 0);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
if (!hts_loop_tick(sback, opt, -1, ptr))
|
||||
return HTS_FALSE;
|
||||
}
|
||||
opt->state._hts_in_html_parsing = prev;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
int back_pluggable_sockets(struct_back * sback, httrackp * opt) {
|
||||
int n;
|
||||
|
||||
@@ -3742,6 +3814,14 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (StringNotEmpty(from->strip_query))
|
||||
StringCopyS(to->strip_query, from->strip_query);
|
||||
|
||||
if (StringNotEmpty(from->cookies_file))
|
||||
StringCopyS(to->cookies_file, from->cookies_file);
|
||||
|
||||
if (from->pause_max_ms > 0) {
|
||||
to->pause_min_ms = from->pause_min_ms;
|
||||
to->pause_max_ms = from->pause_max_ms;
|
||||
}
|
||||
|
||||
if (from->retry > -1)
|
||||
to->retry = from->retry;
|
||||
|
||||
|
||||
@@ -362,6 +362,20 @@ void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
// Finish the makeindex index.html (footer + refresh meta), run usercommand.
|
||||
// Updates *makeindex_done/*makeindex_fp in place; adr/fil are the mode strings.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil);
|
||||
|
||||
// Flush ht_buff[0..ht_len] to save on disk (skip if MD5 unchanged); *fp
|
||||
// closed+NULLed on write. Precondition: ht_len>0.
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
@@ -418,6 +432,19 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
|
||||
|
||||
/* One engine-loop tick: refresh the transfer stats and run the loop callback
|
||||
for slot b (-1 = none). HTS_FALSE = the callback requested an abort. */
|
||||
hts_boolean hts_loop_tick(struct_back *sback, httrackp *opt, int b, int ptr);
|
||||
|
||||
/* Wait until a test socket can be plugged, pumping transfers, stats and the
|
||||
loop callback; gives up past the -E deadline. HTS_FALSE = callback abort. */
|
||||
hts_boolean hts_wait_available_socket(struct_back *sback, httrackp *opt,
|
||||
cache_back *cache, int ptr);
|
||||
|
||||
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
|
||||
timestamp seed so it is stable within one gap and rerolls per launch. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
|
||||
|
||||
/* Schedule more links from the heap into free slots. Returns the number queued,
|
||||
or <=0 if none could be added (no free slot / paused / stopped). */
|
||||
int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
@@ -466,4 +493,8 @@ void voidf(void);
|
||||
/* HTML marker comment marking where the top index is spliced. */
|
||||
#define HTS_TOPINDEX "TOP_INDEX_HTTRACK"
|
||||
|
||||
/* Worst-case byte expansion HT_ADD_HTMLESCAPED* must reserve per escaper. */
|
||||
#define HTS_HTMLESCAPE_MAXEXP 5 /* escape_for_html_print: '&'->"&amp;" */
|
||||
#define HTS_HTMLESCAPE_FULL_MAXEXP 6 /* _full: high byte->"&#xHH;" */
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1976,6 +1976,51 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
StringCat(opt->strip_query, argv[na]);
|
||||
}
|
||||
break;
|
||||
case 'K': // cookies-file: extra Netscape cookies.txt to preload
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
HTS_PANIC_PRINTF(
|
||||
"Option cookies-file needs a blank space and "
|
||||
"a cookies.txt path");
|
||||
printf("Example: --cookies-file \"/home/me/cookies.txt\"\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
na++;
|
||||
if (strlen(argv[na]) >= 1024) {
|
||||
HTS_PANIC_PRINTF("Cookie file path too long");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
}
|
||||
StringCopy(opt->cookies_file, argv[na]);
|
||||
}
|
||||
break;
|
||||
case 'G': // pause: randomized inter-file delay MIN[:MAX] seconds
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
HTS_PANIC_PRINTF("Option pause needs a blank space and a "
|
||||
"delay in seconds (MIN[:MAX])");
|
||||
printf("Example: --pause 5:10\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
double pmin = 0, pmax = 0;
|
||||
int nf;
|
||||
|
||||
na++;
|
||||
nf = sscanf(argv[na], "%lf:%lf", &pmin, &pmax);
|
||||
if (nf < 2)
|
||||
pmax = pmin; /* a single value means a fixed delay */
|
||||
/* positive-form bounds: NaN fails every comparison, so this
|
||||
rejects it before the undefined (int)(NaN*1000) cast */
|
||||
if (nf < 1 || !(pmin >= 0 && pmax >= pmin && pmax <= 86400)) {
|
||||
HTS_PANIC_PRINTF("Invalid --pause range (expected "
|
||||
"MIN[:MAX] seconds, 0<=MIN<=MAX<=86400)");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
}
|
||||
opt->pause_min_ms = (int) (pmin * 1000.0);
|
||||
opt->pause_max_ms = (int) (pmax * 1000.0);
|
||||
}
|
||||
break;
|
||||
case 't': /* do not change type (ending) of filenames according to the MIME type */
|
||||
opt->no_type_change = 1;
|
||||
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
|
||||
|
||||
@@ -69,11 +69,15 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
#endif
|
||||
|
||||
/* Marks a symbol an external wrapper module exports back to the engine
|
||||
(dllexport on Windows, nothing elsewhere). */
|
||||
/* Marks a symbol an external wrapper module exports back to the engine.
|
||||
Must override -fvisibility=hidden on ELF, or dlopen()ed plugins (htsjava)
|
||||
hide their own hts_plug()/hts_unplug() entry points. */
|
||||
#ifndef EXTERNAL_FUNCTION
|
||||
#ifdef _WIN32
|
||||
#define EXTERNAL_FUNCTION __declspec(dllexport)
|
||||
#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
|
||||
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
|
||||
#define EXTERNAL_FUNCTION __attribute__((visibility("default")))
|
||||
#else
|
||||
#define EXTERNAL_FUNCTION
|
||||
#endif
|
||||
|
||||
@@ -30,12 +30,14 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssafe.h"
|
||||
|
||||
/* static int decode_entity(const unsigned int hash, const size_t len);
|
||||
*/
|
||||
/* static int decode_entity(const uint64_t hash, const size_t len);
|
||||
*/
|
||||
#include "htsentities.h"
|
||||
|
||||
/* hexadecimal conversion */
|
||||
@@ -50,30 +52,31 @@ static int get_hex_value(char c) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Numerical Recipes,
|
||||
see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */
|
||||
#define HASH_PRIME ( 1664525 )
|
||||
#define HASH_CONST ( 1013904223 )
|
||||
#define HASH_ADD(HASH, C) do { \
|
||||
(HASH) *= HASH_PRIME; \
|
||||
(HASH) += HASH_CONST; \
|
||||
(HASH) += (C); \
|
||||
} while(0)
|
||||
/* 64-bit FNV-1a; must match htsentities.sh, which keys the entity table on it.
|
||||
*/
|
||||
#define HASH_INIT 0xcbf29ce484222325ULL
|
||||
#define HASH_PRIME 0x100000001b3ULL
|
||||
#define HASH_ADD(HASH, C) \
|
||||
do { \
|
||||
(HASH) ^= (unsigned char) (C); \
|
||||
(HASH) *= HASH_PRIME; \
|
||||
} while (0)
|
||||
|
||||
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
|
||||
size_t i, j, ampStart, ampStartDest;
|
||||
int uc;
|
||||
int hex;
|
||||
unsigned int hash;
|
||||
uint64_t hash;
|
||||
|
||||
assertf(max != 0);
|
||||
for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
|
||||
uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
|
||||
for (i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0,
|
||||
hash = HASH_INIT;
|
||||
src[i] != '\0'; i++) {
|
||||
/* start of entity */
|
||||
if (src[i] == '&') {
|
||||
ampStart = i;
|
||||
ampStartDest = j;
|
||||
hash = 0;
|
||||
hash = HASH_INIT;
|
||||
uc = -1;
|
||||
}
|
||||
/* inside a potential entity */
|
||||
@@ -174,14 +177,11 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
/* alphanumerical entity */
|
||||
else {
|
||||
/* alphanum and not too far ('&thetasym;' is the longest) */
|
||||
if (i <= ampStart + 10 &&
|
||||
(
|
||||
(src[i] >= '0' && src[i] <= '9')
|
||||
|| (src[i] >= 'A' && src[i] <= 'Z')
|
||||
|| (src[i] >= 'a' && src[i] <= 'z')
|
||||
)
|
||||
) {
|
||||
/* alphanum, capped at the longest name
|
||||
* '&CounterClockwiseContourIntegral;' (31) */
|
||||
if (i <= ampStart + 31 && ((src[i] >= '0' && src[i] <= '9') ||
|
||||
(src[i] >= 'A' && src[i] <= 'Z') ||
|
||||
(src[i] >= 'a' && src[i] <= 'z'))) {
|
||||
/* compute hash */
|
||||
HASH_ADD(hash, (unsigned char) src[i]);
|
||||
} else {
|
||||
@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* copy */
|
||||
if (j + 1 > max) {
|
||||
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
/* overflow */
|
||||
return -1;
|
||||
}
|
||||
@@ -300,6 +300,11 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
|
||||
/* Was the character read successfully ? */
|
||||
if (nRead == utfBufferSize) {
|
||||
/* the 'continue' below skips the NUL-reserve guard: re-check */
|
||||
if (utfBufferJ + utfBufferSize >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Rollback write position to sequence start write position */
|
||||
j = utfBufferJ;
|
||||
|
||||
@@ -314,8 +319,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for overflow */
|
||||
if (j + 1 > max) {
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
13586
src/htsentities.h
13586
src/htsentities.h
File diff suppressed because it is too large
Load Diff
@@ -1,75 +1,92 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Regenerate htsentities.h from the WHATWG named character references.
|
||||
|
||||
src=html40.txt
|
||||
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
|
||||
set -euo pipefail
|
||||
|
||||
src=entities.json
|
||||
url=https://html.spec.whatwg.org/entities.json
|
||||
dest=htsentities.h
|
||||
|
||||
(
|
||||
cat <<EOF
|
||||
/*
|
||||
-- ${dest} --
|
||||
FILE GENERATED BY $0, DO NOT MODIFY
|
||||
# fnv1a NAME
# Print the 64-bit FNV-1a hash of NAME as a C constant (0x................ULL).
# The result must stay in sync with the hash in htsencoding.c. Bash arithmetic
# is 64-bit two's complement, so the offset basis is held as its wrapped
# (signed) bit pattern and every multiplication wraps modulo 2^64: the printed
# value is bit-exact.
fnv1a() {
	local name=$1
	local len=${#name}
	local pos=0
	local byte
	local acc=$((0xcbf29ce484222325))

	while ((pos < len)); do
		# "'X" makes printf yield the numeric code of character X
		printf -v byte '%d' "'${name:pos:1}"
		# xor the low byte in, then multiply by the FNV prime
		acc=$(((acc ^ (byte & 0xff)) * 0x100000001b3))
		pos=$((pos + 1))
	done
	printf '0x%016xULL' "$acc"
}
|
||||
|
||||
We compute the LCG hash
|
||||
(see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
|
||||
for each entity. We should in theory check using strncmp() that we
|
||||
actually have the correct entity, but this is actually statistically
|
||||
not needed.
|
||||
if [ ! -f "$src" ]; then
|
||||
curl -fsS "$url" -o "$src"
|
||||
fi
|
||||
|
||||
We may want to do better, but we expect the hash function to be uniform, and
|
||||
let the compiler be smart enough to optimize the switch (for example by
|
||||
checking in log2() intervals)
|
||||
|
||||
This code has been generated using the evil $0 script.
|
||||
*/
|
||||
# Keep ';'-terminated single-codepoint names; the ~93 multi-codepoint refs can't
|
||||
# fit decode_entity's single-codepoint return and are skipped (left verbatim).
|
||||
pairs=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length == 1)))
|
||||
| sort_by(.key)
|
||||
| .[] | "\(.key | ltrimstr("&") | rtrimstr(";"))\t\(.value.codepoints[0])"' "$src")
|
||||
|
||||
static int decode_entity(const unsigned int hash, const size_t len) {
|
||||
# Skipped multi-codepoint names, kept to prove none aliases an emitted hash.
|
||||
skipped=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length > 1)))
|
||||
| .[] | .key | ltrimstr("&") | rtrimstr(";")' "$src")
|
||||
|
||||
cases=""
|
||||
emit_hashes=""
|
||||
while IFS=$'\t' read -r name cp; do
|
||||
hash=$(fnv1a "$name")
|
||||
cases+=" /* $name */"$'\n'
|
||||
cases+=" case $hash:"$'\n'
|
||||
cases+=" if (len == ${#name}) {"$'\n'
|
||||
cases+=" return $cp;"$'\n'
|
||||
cases+=" }"$'\n'
|
||||
cases+=" break;"$'\n'
|
||||
emit_hashes+="$hash"$'\n'
|
||||
done <<<"$pairs"
|
||||
|
||||
skip_hashes=""
|
||||
while IFS= read -r name; do
|
||||
[ -n "$name" ] && skip_hashes+="$(fnv1a "$name")"$'\n'
|
||||
done <<<"$skipped"
|
||||
|
||||
# The switch keys on the hash alone, so the dispatch is correct only while every
|
||||
# emitted name hashes uniquely; prove it here, no runtime name compare needed.
|
||||
dups=$(printf '%s' "$emit_hashes" | sort | uniq -d || true)
|
||||
if [ -n "$dups" ]; then
|
||||
echo "FATAL: two entity names share a hash (duplicate switch case); change the hash:" >&2
|
||||
echo "$dups" >&2
|
||||
exit 1
|
||||
fi
|
||||
# A skipped name colliding with an emitted hash would mis-decode instead of
|
||||
# staying verbatim; forbid that too.
|
||||
aliased=$(comm -12 <(printf '%s' "$emit_hashes" | sort -u) <(printf '%s' "$skip_hashes" | sort -u) || true)
|
||||
if [ -n "$aliased" ]; then
|
||||
echo "FATAL: a skipped multi-codepoint name aliases an emitted hash:" >&2
|
||||
echo "$aliased" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cat >"$dest" <<EOF
|
||||
/* GENERATED by $0 from the WHATWG named character references
|
||||
(${url}). DO NOT EDIT.
|
||||
Dispatch keys on a 64-bit FNV-1a hash of the entity name; the generator
|
||||
aborts on any hash collision, so no runtime name compare is needed. */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
static int decode_entity(const uint64_t hash, const size_t len) {
|
||||
switch(hash) {
|
||||
EOF
|
||||
(
|
||||
if test -f ${src}; then
|
||||
cat ${src}
|
||||
else
|
||||
GET "${url}"
|
||||
fi
|
||||
) |
|
||||
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
|
||||
sed \
|
||||
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
||||
-e 's/-->$//' \
|
||||
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
|
||||
(
|
||||
read -r A
|
||||
while test -n "$A"; do
|
||||
ent="${A%% *}"
|
||||
code=$(echo "$A" | cut -f2 -d' ')
|
||||
# compute hash
|
||||
hash=0
|
||||
i=0
|
||||
a=1664525
|
||||
c=1013904223
|
||||
m="$((1 << 32))"
|
||||
while test "$i" -lt ${#ent}; do
|
||||
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
||||
hash="$((((hash * a) % (m) + d + c) % (m)))"
|
||||
i=$((i + 1))
|
||||
done
|
||||
echo -e " /* $A */"
|
||||
echo -e " case ${hash}u:"
|
||||
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
|
||||
echo -e " return ${code};"
|
||||
echo -e " }"
|
||||
echo -e " break;"
|
||||
|
||||
# next
|
||||
read -r A
|
||||
done
|
||||
)
|
||||
cat <<EOF
|
||||
}
|
||||
${cases} }
|
||||
/* unknown */
|
||||
return -1;
|
||||
}
|
||||
EOF
|
||||
) >${dest}
|
||||
|
||||
echo "wrote $dest ($(grep -c '^ case ' "$dest") entities)" >&2
|
||||
|
||||
@@ -193,7 +193,12 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
int len = (int) strlen(joker);
|
||||
|
||||
while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
|
||||
if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
// '\' escapes the next char as a literal member, e.g. *[\[\]]
|
||||
if (joker[i] == '\\' && joker[i + 1] != '\0') {
|
||||
i++;
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
} else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
int lsize = 0;
|
||||
int lverdict;
|
||||
|
||||
@@ -221,7 +226,9 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
while(isdigit((unsigned char) joker[i]))
|
||||
i++;
|
||||
}
|
||||
} else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
|
||||
} else if (joker[i + 1] == '-' && joker[i + 2] != '\0') {
|
||||
// range *[A-Z]; the '\0' guard rejects a truncated *[a- (else
|
||||
// i+=3 overshoots the NUL)
|
||||
if ((int) (unsigned char) joker[i + 2] >
|
||||
(int) (unsigned char) joker[i]) {
|
||||
int j;
|
||||
@@ -233,10 +240,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
}
|
||||
// else err=1;
|
||||
i += 3;
|
||||
} else { // 1 car, ex: *[ ]
|
||||
if (joker[i + 2] == '\\' && joker[i + 3] != 0) { // escaped char, such as *[\[] or *[\]]
|
||||
i++;
|
||||
}
|
||||
} else { // 1 car, ex: *[ ]
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
}
|
||||
|
||||
50
src/htsftp.c
50
src/htsftp.c
@@ -128,6 +128,33 @@ void launch_ftp(FTPDownloadStruct * params) {
|
||||
return 0; \
|
||||
}
|
||||
|
||||
/* Bounded split of a hostile-URL "user[:pass]@" prefix (see htsftp.h).
 *
 * src        start of the identification prefix.
 * end        result of jump_identification(): one past the '@' closing the
 *            prefix, so the '@' itself sits at end[-1].
 * user/pass  output buffers of user_size / pass_size bytes (both nonzero).
 *            Each is always NUL-terminated; over-long fields are truncated to
 *            fit. pass is empty when the prefix has no ':' separator.
 *
 * Both scans stop before the closing '@': a password-less "user@host[:port]"
 * prefix therefore yields "user", not "user@host". */
void ftp_split_userpass(const char *src, const char *end, char *user,
                        size_t user_size, char *pass, size_t pass_size) {
  size_t n = 0;

  assertf(user_size > 0 && pass_size > 0); /* the size-1 math underflows on 0 */

  /* User name: everything up to the first ':' or the closing '@'. The scan
     continues past the buffer capacity (without storing) so that n still
     lands on the separator. */
  while (&src[n + 1] < end && src[n] != '\0' && src[n] != ':') {
    if (n < user_size - 1)
      user[n] = src[n];
    n++;
  }
  user[n < user_size ? n : user_size - 1] = '\0';

  pass[0] = '\0';
  if (src[n] == ':') { /* password follows the colon */
    const size_t base = n + 1;
    size_t k = 0;

    /* Same bound as above: stop before the '@' located at end[-1]. */
    while (&src[base + k + 1] < end && src[base + k] != '\0') {
      if (k < pass_size - 1)
        pass[k] = src[base + k];
      k++;
    }
    pass[k < pass_size ? k : pass_size - 1] = '\0';
  }
}
|
||||
|
||||
// la véritable fonction une fois lancées les routines thread/fork
|
||||
int run_launch_ftp(FTPDownloadStruct * pStruct) {
|
||||
lien_back *back = pStruct->pBack;
|
||||
@@ -173,24 +200,7 @@ int run_launch_ftp(FTPDownloadStruct * pStruct) {
|
||||
while(*real_adr == '/')
|
||||
real_adr++; // sauter /
|
||||
if ((adr = jump_identification(real_adr)) != real_adr) { // user
|
||||
int i = -1;
|
||||
|
||||
pass[0] = '\0';
|
||||
do {
|
||||
i++;
|
||||
user[i] = real_adr[i];
|
||||
} while((real_adr[i] != ':') && (real_adr[i]));
|
||||
user[i] = '\0';
|
||||
if (real_adr[i] == ':') { // pass
|
||||
int j = -1;
|
||||
|
||||
i++; // oui on saute aussi le :
|
||||
do {
|
||||
j++;
|
||||
pass[j] = real_adr[i + j];
|
||||
} while(((&real_adr[i + j + 1]) < adr) && (real_adr[i + j]));
|
||||
pass[j] = '\0';
|
||||
}
|
||||
ftp_split_userpass(real_adr, adr, user, sizeof(user), pass, sizeof(pass));
|
||||
}
|
||||
// Calculer RETR <nom>
|
||||
{
|
||||
@@ -984,8 +994,8 @@ int get_ftp_line(T_SOC soc, char *ptrline, size_t line_size, int timeout) {
|
||||
//case 0: break; // pas encore --> erreur (on attend)!
|
||||
case 1:
|
||||
HTS_STAT.HTS_TOTAL_RECV += 1; // compter flux entrant
|
||||
if ((b != 10) && (b != 13))
|
||||
data[i++] = b;
|
||||
if ((b != 10) && (b != 13) && (i < (int) sizeof(data) - 1))
|
||||
data[i++] = b; // truncate hostile over-long reply lines
|
||||
break;
|
||||
default:
|
||||
if (ptrline)
|
||||
|
||||
@@ -70,6 +70,11 @@ int back_launch_ftp(FTPDownloadStruct * params);
|
||||
int run_launch_ftp(FTPDownloadStruct * params);
|
||||
int send_line(T_SOC soc, const char *data);
|
||||
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
|
||||
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
|
||||
bounded, NUL-terminated user/pass buffers, truncating to fit.
|
||||
Both sizes must be nonzero. */
|
||||
void ftp_split_userpass(const char *src, const char *end, char *user,
|
||||
size_t user_size, char *pass, size_t pass_size);
|
||||
T_SOC get_datasocket(char *to_send, size_t to_send_size);
|
||||
int stop_ftp(lien_back * back);
|
||||
char *linejmp(char *line);
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-9"
|
||||
#define HTTRACK_VERSIONID "3.49.9"
|
||||
#define HTTRACK_VERSION "3.49-10"
|
||||
#define HTTRACK_VERSIONID "3.49.10"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
@@ -229,6 +229,10 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEFAULT_FOOTER \
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION \
|
||||
" " HTTRACK_AFF_AUTHORS ", %s -->"
|
||||
/* Honest crawler User-Agent; no fake OS/browser to go stale. */
|
||||
#define HTS_DEFAULT_USER_AGENT \
|
||||
"Mozilla/5.0 (compatible; HTTrack/" HTTRACK_AFF_VERSION \
|
||||
"; +https://www.httrack.com/)"
|
||||
#define HTTRACK_WEB "http://www.httrack.com"
|
||||
#define HTS_UPDATE_WEBSITE \
|
||||
"http://www.httrack.com/" \
|
||||
|
||||
@@ -521,6 +521,7 @@ void help(const char *app, int more) {
|
||||
infomsg(" EN maximum mirror time in seconds (60=1 minute, 3600=1 hour)");
|
||||
infomsg(" AN maximum transfer rate in bytes/seconds (1000=1KB/s max)");
|
||||
infomsg(" %cN maximum number of connections/seconds (*%c10)");
|
||||
infomsg(" %G random pause of MIN[:MAX] seconds between files (e.g. %G5:10)");
|
||||
infomsg
|
||||
(" GN pause transfer if N bytes reached, and wait until lock file is deleted");
|
||||
infomsg("");
|
||||
@@ -572,6 +573,7 @@ void help(const char *app, int more) {
|
||||
infomsg("");
|
||||
infomsg("Spider options:");
|
||||
infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)");
|
||||
infomsg(" %K load extra cookies from a Netscape cookies.txt");
|
||||
infomsg
|
||||
(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)");
|
||||
infomsg
|
||||
|
||||
@@ -63,6 +63,9 @@ Please visit our Website: http://www.httrack.com
|
||||
/* This file */
|
||||
#include "htsjava.h"
|
||||
|
||||
/* calloct/freet wrappers */
|
||||
#include "htssafe.h"
|
||||
|
||||
static int reverse_endian(void) {
|
||||
int endian = 1;
|
||||
|
||||
@@ -204,7 +207,16 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
return 0;
|
||||
}
|
||||
|
||||
tab = (RESP_STRUCT *) calloc(header.count, sizeof(RESP_STRUCT));
|
||||
/* A constant-pool entry is >= 1 byte on disk; reject a count exceeding
|
||||
the file size (hostile .class ~68 MB alloc DoS). */
|
||||
if (!hts_count_fits(header.count, (LLint) fsize(file))) {
|
||||
fclose(fpout);
|
||||
sprintf(str->err_msg,
|
||||
"Invalid constant pool count %u (file len " LLintP ")",
|
||||
(unsigned) header.count, (LLint) fsize(file));
|
||||
return 0;
|
||||
}
|
||||
tab = (RESP_STRUCT *) calloct(header.count, sizeof(RESP_STRUCT));
|
||||
if (!tab) {
|
||||
sprintf(str->err_msg, "Unable to alloc %d bytes",
|
||||
(int) sizeof(RESP_STRUCT));
|
||||
@@ -230,7 +242,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
} else { // ++ une erreur est survenue!
|
||||
if (strnotempty(str->err_msg) == 0)
|
||||
strcpy(str->err_msg, "Internal readtable error");
|
||||
free(tab);
|
||||
freet(tab);
|
||||
if (fpout) {
|
||||
fclose(fpout);
|
||||
fpout = NULL;
|
||||
@@ -288,7 +300,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
#if JAVADEBUG
|
||||
printf("end\n");
|
||||
#endif
|
||||
free(tab);
|
||||
freet(tab);
|
||||
if (fpout) {
|
||||
fclose(fpout);
|
||||
fpout = NULL;
|
||||
|
||||
@@ -33,15 +33,19 @@ Please visit our Website: http://www.httrack.com
|
||||
#ifndef HTSJAVA_DEFH
|
||||
#define HTSJAVA_DEFH
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_JAVA_HEADER
|
||||
#define HTS_DEF_FWSTRUCT_JAVA_HEADER
|
||||
typedef struct JAVA_HEADER JAVA_HEADER;
|
||||
#endif
|
||||
/* 10-byte on-disk .class header image, fread() directly: fields need exact
|
||||
widths (LP64's 8-byte 'unsigned long' magic never matched 0xCAFEBABE). */
|
||||
struct JAVA_HEADER {
|
||||
unsigned long int magic;
|
||||
unsigned short int minor;
|
||||
unsigned short int major;
|
||||
unsigned short int count;
|
||||
uint32_t magic;
|
||||
uint16_t minor;
|
||||
uint16_t major;
|
||||
uint16_t count;
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_RESP_STRUCT
|
||||
|
||||
167
src/htslib.c
167
src/htslib.c
@@ -563,6 +563,39 @@ const char *hts_mime[][2] = {
|
||||
{"", ""}
|
||||
};
|
||||
|
||||
/* Modern web formats (post-2010), kept in their own table: appending to the
|
||||
legacy hts_mime[] above makes clang-format reflow its whole initializer.
|
||||
Scanned after hts_mime[], so it never shadows a legacy mapping. */
|
||||
static const char *hts_mime_modern[][2] = {
|
||||
{"image/webp", "webp"},
|
||||
{"image/avif", "avif"},
|
||||
{"image/heic", "heic"},
|
||||
{"font/woff", "woff"},
|
||||
{"font/woff2", "woff2"},
|
||||
{"font/ttf", "ttf"},
|
||||
{"font/otf", "otf"},
|
||||
{"application/json", "json"},
|
||||
{"application/ld+json", "jsonld"},
|
||||
{"application/manifest+json", "webmanifest"},
|
||||
{"application/wasm", "wasm"},
|
||||
{"text/javascript", "js"},
|
||||
{"text/javascript", "mjs"},
|
||||
{"text/markdown", "md"},
|
||||
{"video/mp4", "mp4"},
|
||||
{"video/webm", "webm"},
|
||||
{"video/ogg", "ogv"},
|
||||
{"video/mp2t", "ts"},
|
||||
{"audio/mp4", "m4a"},
|
||||
{"audio/aac", "aac"},
|
||||
{"audio/ogg", "oga"},
|
||||
{"audio/opus", "opus"},
|
||||
{"audio/flac", "flac"},
|
||||
{"audio/webm", "weba"},
|
||||
{"application/x-7z-compressed", "7z"},
|
||||
{"application/x-rar-compressed", "rar"},
|
||||
{"application/zstd", "zst"},
|
||||
{"", ""}};
|
||||
|
||||
// Reserved (RFC2396)
|
||||
#define CIS(c,ch) ( ((unsigned char)(c)) == (ch) )
|
||||
#define CHAR_RESERVED(c) ( CIS(c,';') \
|
||||
@@ -1116,7 +1149,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
|
||||
|
||||
linput(fp, line, 1000);
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
|
||||
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
|
||||
size_t ret;
|
||||
// selon que l'on a ou pas un proxy
|
||||
if (retour->req.proxy.active) {
|
||||
@@ -1293,16 +1327,12 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
|
||||
// Compression accepted ?
|
||||
if (retour->req.http11) {
|
||||
hts_boolean compressible = HTS_FALSE;
|
||||
#if HTS_USEZLIB
|
||||
if ((!retour->req.range_used)
|
||||
&& (!retour->req.nocompression))
|
||||
print_buffer(&bstr, "Accept-Encoding: " "gzip" /* gzip if the preffered encoding */
|
||||
", " "identity;q=0.9" H_CRLF);
|
||||
else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
#else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
compressible = (!retour->req.range_used && !retour->req.nocompression);
|
||||
#endif
|
||||
print_buffer(&bstr, "Accept-Encoding: %s" H_CRLF,
|
||||
hts_acceptencoding(compressible));
|
||||
}
|
||||
|
||||
/* Authentification */
|
||||
@@ -1918,6 +1948,10 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
|
||||
return "Requested Range Not Satisfiable";
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
case 501:
|
||||
@@ -4098,25 +4132,33 @@ DECLARE_APPEND_ESCAPE_VERSION(escape_uri)
|
||||
|
||||
#undef DECLARE_APPEND_ESCAPE_VERSION
|
||||
|
||||
// Same as above, but in-place
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_ ##NAME(char *const dest, const size_t size) { \
|
||||
char buffer[256]; \
|
||||
const size_t len = strnlen(dest, size); \
|
||||
const int in_buffer = len + 1 < sizeof(buffer); \
|
||||
char *src = in_buffer ? buffer : malloct(len + 1); \
|
||||
size_t ret; \
|
||||
assertf(src != NULL); \
|
||||
assertf(len < size); \
|
||||
memcpy(src, dest, len + 1); \
|
||||
ret = NAME(src, dest, size); \
|
||||
if (!in_buffer) { \
|
||||
freet(src); \
|
||||
} \
|
||||
return ret; \
|
||||
// In-place escaping: copy dest aside, then escape that copy back into dest.
|
||||
typedef size_t (*escape_fn_t)(const char *src, char *dest, size_t size);
|
||||
|
||||
static size_t inplace_escape(char *const dest, const size_t size,
|
||||
escape_fn_t escape) {
|
||||
char buffer[256];
|
||||
const size_t len = strnlen(dest, size);
|
||||
const int in_buffer = len + 1 < sizeof(buffer);
|
||||
char *src = in_buffer ? buffer : malloct(len + 1);
|
||||
size_t ret;
|
||||
assertf(src != NULL);
|
||||
assertf(len < size);
|
||||
memcpy(src, dest, len + 1);
|
||||
ret = escape(src, dest, size);
|
||||
if (!in_buffer) {
|
||||
freet(src);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Thin exported wrappers binding inplace_escape() to each escaper (ABI).
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_##NAME(char *const dest, const size_t size) { \
|
||||
return inplace_escape(dest, size, NAME); \
|
||||
}
|
||||
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_in_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_spc_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_uri_utf)
|
||||
@@ -4308,6 +4350,20 @@ void guess_httptype(httrackp * opt, char *s, const char *fil) {
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, 1);
|
||||
}
|
||||
|
||||
// first match in a NUL-terminated {mime,ext} table. key selects the lookup
|
||||
// column (0=mime, 1=ext); returns the other column, or NULL if no row matches
|
||||
// (a "*" partner means the row carries no value).
|
||||
static const char *hts_mime_lookup(const char *(*table)[2], int key,
|
||||
const char *needle) {
|
||||
int j;
|
||||
|
||||
for (j = 0; strnotempty(table[j][1]); j++) {
|
||||
if (strfield2(table[j][key], needle) && table[j][!key][0] != '*')
|
||||
return table[j][!key];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// write the mime type for fil into s (capacity ssize)
|
||||
// flag: 1 to always return a type (the "application/..." / octet-stream
|
||||
// fallback) returns 1 if a type was written to s, 0 otherwise
|
||||
@@ -4331,17 +4387,15 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
const char *mime;
|
||||
|
||||
a++;
|
||||
while(strnotempty(hts_mime[j][1])) {
|
||||
if (strfield2(hts_mime[j][1], a)) {
|
||||
if (hts_mime[j][0][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][0], ssize);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
mime = hts_mime_lookup(hts_mime, 1, a);
|
||||
if (mime == NULL)
|
||||
mime = hts_mime_lookup(hts_mime_modern, 1, a);
|
||||
if (mime != NULL) {
|
||||
strlcpybuff(s, mime, ssize);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (flag) {
|
||||
@@ -4365,6 +4419,11 @@ HTSEXT_API void get_httptype(httrackp *opt, char *s, const char *fil,
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, flag);
|
||||
}
|
||||
|
||||
/* Advertised Accept-Encoding; gzip and deflate both decode via hts_zunpack */
|
||||
const char *hts_acceptencoding(hts_boolean compressible) {
|
||||
return compressible ? "gzip, deflate, identity;q=0.9" : "identity";
|
||||
}
|
||||
|
||||
// get type of fil (php)
|
||||
// s: buffer (text/html) or NULL
|
||||
// return: 1 if known by user
|
||||
@@ -4476,18 +4535,16 @@ int get_userhttptype(httrackp * opt, char *s, const char *fil) {
|
||||
// returns 1 if an extension was found (and written to s), 0 otherwise
|
||||
int give_mimext(char *s, size_t ssize, const char *st) {
|
||||
int ok = 0;
|
||||
int j = 0;
|
||||
const char *ext;
|
||||
|
||||
st = hts_effective_mime(st); /* no declared type: derive an html ext */
|
||||
s[0] = '\0';
|
||||
while((!ok) && (strnotempty(hts_mime[j][1]))) {
|
||||
if (strfield2(hts_mime[j][0], st)) {
|
||||
if (hts_mime[j][1][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][1], ssize);
|
||||
ok = 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
ext = hts_mime_lookup(hts_mime, 0, st);
|
||||
if (ext == NULL)
|
||||
ext = hts_mime_lookup(hts_mime_modern, 0, st);
|
||||
if (ext != NULL) {
|
||||
strlcpybuff(s, ext, ssize);
|
||||
ok = 1;
|
||||
}
|
||||
// wrap "x" mimetypes, such as:
|
||||
// application/x-mp3
|
||||
@@ -5754,6 +5811,13 @@ HTSEXT_API int hts_init(void) {
|
||||
abortLog("unable to initialize TLS: SSL_CTX_new()");
|
||||
assertf("unable to initialize TLS" == NULL);
|
||||
}
|
||||
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
|
||||
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
|
||||
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
|
||||
#else
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
|
||||
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -5959,9 +6023,11 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
"htsswf", "htsjava", "httrack-plugin", NULL
|
||||
};
|
||||
#else
|
||||
static const char *defaultModules[] = {
|
||||
"libhtsswf.so.1", "libhtsjava.so.2", "httrack-plugin", NULL
|
||||
};
|
||||
#ifndef HTS_LIBHTSJAVA_NAME
|
||||
#define HTS_LIBHTSJAVA_NAME "libhtsjava.so" /* non-autoconf fallback */
|
||||
#endif
|
||||
static const char *defaultModules[] = {"libhtsswf.so.1", HTS_LIBHTSJAVA_NAME,
|
||||
"httrack-plugin", NULL};
|
||||
#endif
|
||||
httrackp *opt = malloc(sizeof(httrackp));
|
||||
|
||||
@@ -6005,8 +6071,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->shell = HTS_FALSE;
|
||||
opt->proxy.active = 0; // pas de proxy
|
||||
opt->user_agent_send = HTS_TRUE;
|
||||
StringCopy(opt->user_agent,
|
||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||
StringCopy(opt->user_agent, HTS_DEFAULT_USER_AGENT);
|
||||
StringCopy(opt->referer, "");
|
||||
StringCopy(opt->from, "");
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||
@@ -6045,6 +6110,9 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->no_query_dedup = HTS_FALSE;
|
||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||
StringCopy(opt->strip_query, "");
|
||||
StringCopy(opt->cookies_file, "");
|
||||
opt->pause_min_ms = 0;
|
||||
opt->pause_max_ms = 0;
|
||||
opt->ftp_proxy = HTS_TRUE;
|
||||
opt->convert_utf8 = HTS_TRUE;
|
||||
StringCopy(opt->filelist, "");
|
||||
@@ -6190,6 +6258,7 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
||||
StringFree(opt->footer);
|
||||
StringFree(opt->mod_blacklist);
|
||||
StringFree(opt->strip_query);
|
||||
StringFree(opt->cookies_file);
|
||||
|
||||
StringFree(opt->path_html);
|
||||
StringFree(opt->path_html_utf8);
|
||||
|
||||
@@ -285,6 +285,9 @@ int ishttperror(int err);
|
||||
int get_userhttptype(httrackp * opt, char *s, const char *fil);
|
||||
int give_mimext(char *s, size_t ssize, const char *st);
|
||||
|
||||
/* Advertised Accept-Encoding value (no header name/CRLF); see htslib.c. */
|
||||
const char *hts_acceptencoding(hts_boolean compressible);
|
||||
|
||||
int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename);
|
||||
int may_unknown2(httrackp * opt, const char *mime, const char *filename);
|
||||
|
||||
|
||||
349
src/htsname.c
349
src/htsname.c
@@ -41,6 +41,10 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htstools.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#define ADD_STANDARD_PATH \
|
||||
@@ -70,31 +74,6 @@ static const char *hts_tbdev[] = {
|
||||
""
|
||||
};
|
||||
|
||||
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state. _hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback,opt,cache,0); \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket=back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
|
||||
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
|
||||
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
|
||||
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
|
||||
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
|
||||
/* Check */ \
|
||||
{ \
|
||||
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count,-1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while(0)
|
||||
|
||||
/* Strip all // */
|
||||
static void cleanDoubleSlash(char *s) {
|
||||
int i, j;
|
||||
@@ -138,37 +117,191 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||
extension implies no specific type or the server declared a disagreeing one.
|
||||
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
|
||||
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
|
||||
is named .html. The sentinel rides the cache, so updates stay consistent. */
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
|
||||
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
|
||||
(#267 guard), a declared disagreement is CONTESTED (sniffed below). */
|
||||
typedef enum wire_verdict {
|
||||
WIRE_KEEPS_EXT,
|
||||
WIRE_WINS,
|
||||
WIRE_CONTESTED
|
||||
} wire_verdict;
|
||||
|
||||
static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
|
||||
const char *file, char *urlmime,
|
||||
size_t urlmime_size) {
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||
return WIRE_KEEPS_EXT; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||
urlmime[0] = '\0';
|
||||
/* type implied by the URL extension, only when confidently known (flag 0) */
|
||||
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
|
||||
return 1; /* URL ext implies no known type: trust the wire type */
|
||||
if (!get_httptype_sized(opt, urlmime, urlmime_size, file, 0))
|
||||
return WIRE_WINS; /* URL ext implies no known type */
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
|
||||
the server declared no type (the sentinel); an explicitly declared type,
|
||||
even text/html, is trusted, so a binary-looking URL that really serves
|
||||
HTML (login/error interstitial, soft-404) is named .html. */
|
||||
return WIRE_KEEPS_EXT; /* agreement (no .htm->.html churn) */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
strfield2(wiremime, HTS_UNKNOWN_MIME))
|
||||
return WIRE_KEEPS_EXT; /* no declared type */
|
||||
return WIRE_CONTESTED;
|
||||
}
|
||||
|
||||
/* Optional evidence for a contested wire-vs-ext verdict. */
|
||||
typedef struct sniff_src {
|
||||
struct_back *sback; /* live backing (looked up by adr/fil) */
|
||||
const lien_back *headers; /* snapshot: r.adr, else the url_sav file */
|
||||
const char *adr, *fil;
|
||||
const char *prev_save; /* previous run's save name (cache X-Save) */
|
||||
} sniff_src;
|
||||
|
||||
#if HTS_USEZLIB
|
||||
/* Inflate the head of a gzip/zlib stream; 0 when undecodable. */
|
||||
static size_t sniff_inflate_head(const void *in, size_t in_len, void *out,
|
||||
size_t out_len) {
|
||||
z_stream zs;
|
||||
size_t n = 0;
|
||||
int err;
|
||||
|
||||
memset(&zs, 0, sizeof(zs));
|
||||
if (inflateInit2(&zs, 47) != Z_OK) /* 47: gzip or zlib, autodetected */
|
||||
return 0;
|
||||
zs.next_in = (const Bytef *) in;
|
||||
zs.avail_in = (uInt) in_len;
|
||||
zs.next_out = (Bytef *) out;
|
||||
zs.avail_out = (uInt) out_len;
|
||||
err = inflate(&zs, Z_SYNC_FLUSH);
|
||||
if (err == Z_OK || err == Z_STREAM_END || err == Z_BUF_ERROR)
|
||||
n = out_len - zs.avail_out;
|
||||
inflateEnd(&zs);
|
||||
return n;
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t sniff_read_head(const char *path, void *buf, size_t len) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const fp = FOPEN(fconv(catbuff, sizeof(catbuff), path), "rb");
|
||||
size_t n = 0;
|
||||
|
||||
if (fp != NULL) {
|
||||
n = fread(buf, 1, len, fp);
|
||||
fclose(fp);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Body head of one slot: memory, else its flushed on-disk file (url_sav, or
|
||||
tmpfile for a compressed stream); inflated so the sniff sees the final body.
|
||||
*/
|
||||
static size_t sniff_slot_head(const lien_back *slot, void *buf, size_t len) {
|
||||
const htsblk *const r = &slot->r;
|
||||
size_t n = 0;
|
||||
|
||||
if (r->adr != NULL && r->size > 0) {
|
||||
n = (size_t) r->size < len ? (size_t) r->size : len;
|
||||
memcpy(buf, r->adr, n);
|
||||
} else {
|
||||
if (r->out != NULL)
|
||||
fflush(r->out);
|
||||
if (slot->url_sav[0] != '\0')
|
||||
n = sniff_read_head(slot->url_sav, buf, len);
|
||||
if (n == 0 && slot->tmpfile != NULL && slot->tmpfile[0] != '\0')
|
||||
n = sniff_read_head(slot->tmpfile, buf, len);
|
||||
}
|
||||
if (n > 0 && r->compressed) {
|
||||
#if HTS_USEZLIB
|
||||
unsigned char raw[HTS_SNIFF_LEN];
|
||||
|
||||
if (n > sizeof(raw))
|
||||
n = sizeof(raw);
|
||||
memcpy(raw, buf, n);
|
||||
n = sniff_inflate_head(raw, n, buf, len);
|
||||
#else
|
||||
n = 0;
|
||||
#endif
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Up to len leading body bytes; 0 when unavailable, and always in
|
||||
non-delayed mode (its HEAD-probe first run couldn't sniff either). */
|
||||
static size_t sniff_body_head(httrackp *opt, const sniff_src *src, void *buf,
|
||||
size_t len) {
|
||||
size_t n = 0;
|
||||
|
||||
if (src == NULL || opt->savename_delayed == HTS_SAVENAME_DELAYED_NONE)
|
||||
return 0;
|
||||
/* live backing slot: a snapshot (back_copy_static) loses r.adr/r.out */
|
||||
if (src->sback != NULL && src->adr != NULL && src->fil != NULL) {
|
||||
const int b = back_index(opt, src->sback, src->adr, src->fil, NULL);
|
||||
|
||||
if (b >= 0)
|
||||
n = sniff_slot_head(&src->sback->lnk[b], buf, len);
|
||||
}
|
||||
if (n == 0 && src->headers != NULL)
|
||||
n = sniff_slot_head(src->headers, buf, len);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Contested verdicts: magic proving the URL ext keeps it, else wire wins. */
|
||||
static int wire_patches_ext(httrackp *opt, const sniff_src *src,
|
||||
const char *wiremime, const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
|
||||
case WIRE_KEEPS_EXT:
|
||||
return 0;
|
||||
case WIRE_WINS:
|
||||
return 1;
|
||||
case WIRE_CONTESTED:
|
||||
break;
|
||||
}
|
||||
if (src != NULL) {
|
||||
if (hts_sniff_mime_known(urlmime)) {
|
||||
unsigned char head[HTS_SNIFF_LEN];
|
||||
const size_t n = sniff_body_head(opt, src, head, sizeof(head));
|
||||
|
||||
if (n > 0)
|
||||
return hts_sniff_mime_consistent(head, n, urlmime) ? 0 : 1;
|
||||
}
|
||||
/* no bytes: reproduce the previous run's verdict (cached X-Save name) */
|
||||
if (src->prev_save != NULL && src->prev_save[0] != '\0') {
|
||||
char prevmime[256];
|
||||
|
||||
prevmime[0] = '\0';
|
||||
if (get_httptype_sized(opt, prevmime, sizeof(prevmime), src->prev_save,
|
||||
0) &&
|
||||
strfield2(prevmime, urlmime))
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
||||
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
||||
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
return wiremime != NULL && strnotempty(wiremime) &&
|
||||
wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime)) ==
|
||||
WIRE_CONTESTED &&
|
||||
hts_sniff_mime_known(urlmime);
|
||||
}
|
||||
|
||||
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
|
||||
else the declared type's ext when wire_patches_ext() allows (returns 1),
|
||||
else 0. ext receives the new extension or replacement filename. */
|
||||
static int resolve_extension(httrackp *opt, const sniff_src *src,
|
||||
const char *cdispo, const char *contenttype,
|
||||
const char *fil, char *ext, size_t ext_size) {
|
||||
if (strnotempty(cdispo)) {
|
||||
strlcpybuff(ext, cdispo, ext_size);
|
||||
return 2;
|
||||
}
|
||||
if (wire_patches_ext(opt, src, contenttype, fil) &&
|
||||
give_mimext(ext, ext_size, contenttype))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Build the local save name (save) from adr/fil; renames on collision
|
||||
// (e.g. INDEX.HTML vs index.html).
|
||||
int url_savename(lien_adrfilsave *const afs,
|
||||
lien_adrfil *const former,
|
||||
const char *referer_adr, const char *referer_fil,
|
||||
@@ -405,45 +538,30 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
|
||||
// si option check_type activée
|
||||
if (is_html < 0 && opt->check_type && !ext_chg) {
|
||||
int ishtest = 0;
|
||||
|
||||
if (protocol != PROTOCOL_FILE
|
||||
&& protocol != PROTOCOL_FTP
|
||||
) {
|
||||
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
||||
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||
(ishtest = ishtml(opt, fil)) <
|
||||
0) { // unsure whether it's html or a file
|
||||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
|
||||
// lire dans le cache
|
||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||
char BIGSTK previous_save[HTS_URLMAXSIZE * 2];
|
||||
htsblk r;
|
||||
|
||||
if (r.statuscode != -1) { // pas d'erreur de lecture cache
|
||||
char s[32];
|
||||
previous_save[0] = '\0';
|
||||
r = cache_read_including_broken(opt, cache, adr, fil,
|
||||
previous_save); // test uniquement
|
||||
|
||||
s[0] = '\0';
|
||||
if (r.statuscode != -1) { // cache entry read OK
|
||||
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
|
||||
adr_complete, fil_complete);
|
||||
if (!HTTP_IS_REDIRECT(r.statuscode)) {
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
const sniff_src src = {sback, NULL, adr, fil, previous_save};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, r.cdispo, r.contenttype,
|
||||
fil, ext, sizeof(ext));
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtest == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
//
|
||||
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
Lookup mimetype not only by extension,
|
||||
@@ -467,22 +585,13 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// fail later
|
||||
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||
!opt->state.stop) {
|
||||
// Check if the file is ready in backing. We basically take the same logic as later.
|
||||
// FIXME: we should cleanup and factorize this unholy mess
|
||||
// Check if the file is ready in backing.
|
||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
headers->r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
const sniff_src src = {sback, headers, adr, fil, NULL};
|
||||
|
||||
ext_chg = resolve_extension(opt, &src, headers->r.cdispo,
|
||||
headers->r.contenttype,
|
||||
headers->url_fil, ext, sizeof(ext));
|
||||
}
|
||||
else if (mime_type != NULL) {
|
||||
ext[0] = '\0';
|
||||
@@ -500,13 +609,6 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (!may_unknown2(opt, mime_type, fil)) {
|
||||
ext_chg = 1;
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtml(opt, fil) == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
ext_chg = 0;
|
||||
}
|
||||
@@ -525,11 +627,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
int has_been_moved = 0;
|
||||
lien_adrfil current;
|
||||
|
||||
/* Ensure we don't use too many sockets by using a "testing" one
|
||||
If we have only 1 simultaneous connection authorized, wait for pending download
|
||||
Wait for an available slot
|
||||
/* Wait for an available test slot, honoring the connection limits
|
||||
*/
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
|
||||
/* Rock'in */
|
||||
current.adr[0] = current.fil[0] = '\0';
|
||||
@@ -559,24 +660,11 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart),
|
||||
&HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
stop_looping = 1;
|
||||
}
|
||||
@@ -641,8 +729,9 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
"Loop with HEAD request (during prefetch) at %s%s",
|
||||
current.adr, current.fil);
|
||||
}
|
||||
// Ajouter
|
||||
URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
|
||||
if (!hts_wait_available_socket(sback, opt,
|
||||
cache, ptr))
|
||||
return -1;
|
||||
if (back_add(sback, opt, cache, moved.adr, moved.fil, methode, referer_adr, referer_fil, 1) != -1) { // OK
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"(during prefetch) %s (%d) to link %s at %s%s",
|
||||
@@ -696,30 +785,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// libérer emplacement backing
|
||||
}
|
||||
|
||||
{ // pas d'erreur, changer type?
|
||||
char s[16];
|
||||
|
||||
s[0] = '\0';
|
||||
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtest == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// no error: change the type?
|
||||
ext_chg = resolve_extension(
|
||||
opt, NULL, back[b].r.cdispo, back[b].r.contenttype,
|
||||
back[b].url_fil, ext, sizeof(ext));
|
||||
}
|
||||
// FIN Si non déplacé, forcer type?
|
||||
|
||||
|
||||
@@ -100,6 +100,8 @@ void standard_name(char *b, size_t bsize, const char *dot_pos,
|
||||
const char *nom_pos, const char *fil_complete,
|
||||
int short_ver);
|
||||
void url_savename_addstr(char *d, const char *s);
|
||||
/* Contested wire-vs-ext verdict that a body sniff could settle (htssniff.h). */
|
||||
int hts_ext_sniff_wanted(httrackp *opt, const char *wiremime, const char *file);
|
||||
char *url_md5(char *digest_buffer, const char *fil_complete);
|
||||
void url_savename_refname(const char *adr, const char *fil, char *filename);
|
||||
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
|
||||
|
||||
@@ -535,6 +535,10 @@ struct httrackp {
|
||||
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
|
||||
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
|
||||
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
|
||||
String cookies_file; /**< extra Netscape cookies.txt to preload
|
||||
(--cookies-file) */
|
||||
int pause_min_ms; /**< inter-file pause lower bound, ms (0=off, #185) */
|
||||
int pause_max_ms; /**< inter-file pause upper bound, ms */
|
||||
};
|
||||
|
||||
/* Running statistics for a mirror. */
|
||||
|
||||
494
src/htsparse.c
494
src/htsparse.c
@@ -49,6 +49,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htsindex.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssniff.h"
|
||||
|
||||
/* external modules */
|
||||
#include "htsmodules.h"
|
||||
@@ -77,13 +78,14 @@ Please visit our Website: http://www.httrack.com
|
||||
/** Append to the output buffer the string 'A'. **/
|
||||
#define HT_ADD(A) TypedArrayAppend(output_buffer, A, strlen(A))
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION) do { \
|
||||
/* clang-format off: an edit realigns all backslashes, churning the macro. */
|
||||
/* clang-format off */
|
||||
/** Append 'A' to the output buffer, html-escaped; FACTOR = max byte expansion. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION, FACTOR) do { \
|
||||
if ((opt->getmode & 1) != 0 && ptr>0) { \
|
||||
const char *const str_ = (A); \
|
||||
size_t size_; \
|
||||
/* & is the maximum expansion */ \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * 5 + 1024); \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * (FACTOR) + 1024); \
|
||||
size_ = FUNCTION(str_, &TypedArrayTail(output_buffer), \
|
||||
TypedArrayRoom(output_buffer)); \
|
||||
TypedArraySize(output_buffer) += size_; \
|
||||
@@ -91,188 +93,113 @@ Please visit our Website: http://www.httrack.com
|
||||
} while(0)
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped for &. **/
|
||||
#define HT_ADD_HTMLESCAPED(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print)
|
||||
#define HT_ADD_HTMLESCAPED(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print, HTS_HTMLESCAPE_MAXEXP)
|
||||
|
||||
/**
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* high chars.
|
||||
**/
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full)
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full, HTS_HTMLESCAPE_FULL_MAXEXP)
|
||||
/* clang-format on */
|
||||
|
||||
// does nothing
|
||||
#define XH_uninit do {} while(0)
|
||||
|
||||
#define HT_ADD_END { \
|
||||
int ok=0;\
|
||||
if (TypedArraySize(output_buffer) != 0) { \
|
||||
const size_t ht_len = TypedArraySize(output_buffer); \
|
||||
const char *const ht_buff = TypedArrayElts(output_buffer); \
|
||||
char digest[32+2];\
|
||||
off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),savename()));\
|
||||
digest[0] = '\0';\
|
||||
domd5mem(TypedArrayElts(output_buffer), ht_len, digest, 1);\
|
||||
if (fsize_old == (off_t) ht_len) { \
|
||||
int mlen = 0;\
|
||||
char* mbuff;\
|
||||
cache_readdata(cache,"//[HTML-MD5]//",savename(),&mbuff,&mlen);\
|
||||
if (mlen) \
|
||||
mbuff[mlen]='\0';\
|
||||
if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
|
||||
ok=1;\
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s",savename());\
|
||||
} else {\
|
||||
ok=0;\
|
||||
} \
|
||||
}\
|
||||
if (!ok) { \
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 1, 1, r->notmodified); \
|
||||
fp=filecreate(&opt->state.strc, savename()); \
|
||||
if (fp) { \
|
||||
if (ht_len>0) {\
|
||||
if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
if (opt->log) { \
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to write HTML file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
fclose(fp); fp=NULL; \
|
||||
if (strnotempty(r->lastmodified)) \
|
||||
set_filetime_rfc822(savename(),r->lastmodified); \
|
||||
} else {\
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
hts_log_print(opt, LOG_ERROR, "Mirror aborted: disk full or filesystem problems"); \
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
} else {\
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 0, 0, r->notmodified); \
|
||||
filenote(&opt->state.strc, savename(),NULL); \
|
||||
}\
|
||||
if (cache->ndx)\
|
||||
cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename(),digest,(int)strlen(digest));\
|
||||
} \
|
||||
TypedArrayFree(output_buffer); \
|
||||
}
|
||||
#define HT_ADD_FOP
|
||||
|
||||
// COPY IN HTSCORE.C
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
|
||||
load and store lists can't drift apart. */
|
||||
/* clang-format off */
|
||||
#define ENGINE_MUTABLE_FIELDS(X) \
|
||||
X(int, error, stre->error_) \
|
||||
X(int, store_errpage, stre->store_errpage_) \
|
||||
X(int, makeindex_done, stre->makeindex_done_) \
|
||||
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
|
||||
X(int, makeindex_links, stre->makeindex_links_) \
|
||||
X(LLint, stat_fragment, stre->stat_fragment_)
|
||||
|
||||
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
|
||||
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
|
||||
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
htsblk* const r HTS_UNUSED = stre->r_; \
|
||||
hash_struct* const hash HTS_UNUSED = stre->hash_; \
|
||||
char* const codebase HTS_UNUSED = stre->codebase; \
|
||||
char* const base HTS_UNUSED = stre->base; \
|
||||
/* */ \
|
||||
const char * const template_header HTS_UNUSED = stre->template_header_; \
|
||||
const char * const template_body HTS_UNUSED = stre->template_body_; \
|
||||
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
|
||||
/* */ \
|
||||
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
|
||||
/* */ \
|
||||
/* */ \
|
||||
int error = * stre->error_; \
|
||||
int store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
int makeindex_done = *stre->makeindex_done_; \
|
||||
FILE* makeindex_fp = *stre->makeindex_fp_; \
|
||||
int makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
LLint stat_fragment = *stre->stat_fragment_; \
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
|
||||
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
|
||||
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
|
||||
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
|
||||
|
||||
#define ENGINE_SET_CONTEXT() \
|
||||
ENGINE_SET_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
error = * stre->error_; \
|
||||
store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
makeindex_done = *stre->makeindex_done_; \
|
||||
makeindex_fp = *stre->makeindex_fp_; \
|
||||
makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
stat_fragment = *stre->stat_fragment_; \
|
||||
makestat_time = stre->makestat_time; \
|
||||
makestat_fp = stre->makestat_fp
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
|
||||
|
||||
#define ENGINE_LOAD_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT()
|
||||
|
||||
#define ENGINE_SAVE_CONTEXT() \
|
||||
ENGINE_SAVE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
* stre->error_ = error; \
|
||||
* stre->store_errpage_ = store_errpage; \
|
||||
/* */ \
|
||||
*stre->makeindex_done_ = makeindex_done; \
|
||||
*stre->makeindex_fp_ = makeindex_fp; \
|
||||
*stre->makeindex_links_ = makeindex_links; \
|
||||
/* */ \
|
||||
*stre->stat_fragment_ = stat_fragment
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
|
||||
/* clang-format on */
|
||||
|
||||
#define _FILTERS (*opt->filters.filters)
|
||||
#define _FILTERS_PTR (opt->filters.filptr)
|
||||
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
|
||||
|
||||
/* Apply current *adr character for the script automate */
|
||||
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
|
||||
if (inscript) { \
|
||||
int new_state_pos; \
|
||||
new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*html]; \
|
||||
if (new_state_pos < 0) { \
|
||||
new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
|
||||
} \
|
||||
assertf(new_state_pos >= 0); \
|
||||
assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
|
||||
inscript_state_pos=new_state_pos; \
|
||||
} \
|
||||
} while(0)
|
||||
/* States of the JavaScript-detection automaton. The ten real states number
   the rows of the transition table; INSCRIPT_DEFAULT is not a state but the
   index of the synthetic "any other char" column of that table. */
typedef enum {
  INSCRIPT_START = 0,
  INSCRIPT_ANTISLASH = 1,
  INSCRIPT_INQUOTE = 2,
  INSCRIPT_INQUOTE2 = 3,
  INSCRIPT_SLASH = 4,
  INSCRIPT_SLASHSLASH = 5,
  INSCRIPT_COMMENT = 6,
  INSCRIPT_COMMENT2 = 7,
  INSCRIPT_ANTISLASH_IN_QUOTE = 8,
  INSCRIPT_ANTISLASH_IN_QUOTE2 = 9,
  INSCRIPT_DEFAULT = 256
} INSCRIPT;
|
||||
|
||||
/* Increment current pointer to 'steps' characters, modifying automate if necessary */
|
||||
#define INCREMENT_CURRENT_ADR(steps) do { \
|
||||
int steps__ = (int) ( steps ); \
|
||||
while(steps__ > 0) { \
|
||||
html++; \
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR(); \
|
||||
steps__ --; \
|
||||
} \
|
||||
} while(0)
|
||||
#define INSCRIPT_NSTATES 10 /* rows in the transition table */
|
||||
|
||||
/* Live view of the parser's automaton locals, set up once so the helpers below
|
||||
can drive it without capturing them by lexical scope. */
|
||||
typedef struct {
|
||||
const int *inscript; /* nonzero while inside a script body */
|
||||
const signed char (*table)[257]; /* [INSCRIPT_NSTATES][257] transitions */
|
||||
INSCRIPT *pos; /* current state */
|
||||
const char **html; /* parse cursor */
|
||||
} script_automate;
|
||||
|
||||
/* Feed the current *html byte to the automaton. No-op outside a script body. */
|
||||
static void hts_automate_lookup(const script_automate *aut) {
|
||||
if (*aut->inscript) {
|
||||
int next = aut->table[*aut->pos][(unsigned char) **aut->html];
|
||||
if (next < 0) {
|
||||
next = aut->table[*aut->pos][INSCRIPT_DEFAULT];
|
||||
}
|
||||
assertf(next >= 0 && next < INSCRIPT_NSTATES);
|
||||
*aut->pos = (INSCRIPT) next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
|
||||
static void hts_automate_increment(const script_automate *aut, int steps) {
|
||||
while (steps > 0) {
|
||||
(*aut->html)++;
|
||||
hts_automate_lookup(aut);
|
||||
steps--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Percent-encode the angle brackets of a string so it is safe to embed inside
|
||||
an HTML comment (the default footer) or any other HTML context. A URL holding
|
||||
@@ -302,6 +229,14 @@ static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Truncate 'url' in place at its first '#', removing the fragment of a
 * redirect Location: a UA anchor, never part of the fetched resource (#204).
 * A string without '#' is left unmodified. */
static void url_drop_fragment(char *const url) {
  char *cursor;

  for (cursor = url; *cursor != '\0'; cursor++) {
    if (*cursor == '#') {
      *cursor = '\0';
      return;
    }
  }
}
|
||||
|
||||
/* True if [s, s+len) is exactly an HTTP method token (XHR.open's first
|
||||
argument is a method, not a URL: #218). Case-insensitive. */
|
||||
static int is_http_method(const char *s, size_t len) {
|
||||
@@ -409,20 +344,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int incomment = 0; // dans un <!--
|
||||
int inscript = 0; // dans un scipt pour applets javascript)
|
||||
int inscript_locked = 0; // in locked script (ie. js file)
|
||||
signed char inscript_state[10][257];
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
signed char inscript_state[INSCRIPT_NSTATES][257];
|
||||
INSCRIPT inscript_state_pos = INSCRIPT_START;
|
||||
const char *inscript_name = NULL; // script tag name
|
||||
int inscript_tag = 0; // on est dans un <body onLoad="... terminé par >
|
||||
@@ -483,6 +405,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
inscript_state[INSCRIPT_COMMENT2]['*'] = INSCRIPT_COMMENT2;
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE; /* #8: escape in '' */
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE2; /* #9: escape in "" */
|
||||
const script_automate saut = {&inscript, inscript_state,
|
||||
&inscript_state_pos, &html};
|
||||
|
||||
/* Primary list or URLs */
|
||||
if (ptr == 0) {
|
||||
@@ -681,13 +605,14 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// Decode title with encoding
|
||||
if (str->page_charset_ != NULL
|
||||
&& *str->page_charset_ != '\0') {
|
||||
char *const sUtf =
|
||||
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
|
||||
if (str->page_charset_ != NULL &&
|
||||
*str->page_charset_ != '\0') {
|
||||
char *sUtf = hts_convertStringToUTF8(
|
||||
s, strlen(s), str->page_charset_);
|
||||
if (sUtf != NULL) {
|
||||
strcpy(s, sUtf);
|
||||
free(sUtf);
|
||||
/* UTF-8 can expand past s[]; truncate to fit */
|
||||
snprintf(s, sizeof(s), "%s", sUtf);
|
||||
freet(sUtf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -701,7 +626,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp,
|
||||
makeindex_links, makeindex_firstlink,
|
||||
template_footer, "primary", "primary");
|
||||
}
|
||||
} // if (opt->makeindex)
|
||||
}
|
||||
@@ -919,7 +846,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* automate */
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR();
|
||||
hts_automate_lookup(&saut);
|
||||
|
||||
// Note:
|
||||
// Certaines pages ne respectent pas le html
|
||||
@@ -1835,7 +1762,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// sauter espaces
|
||||
// adr+=p;
|
||||
INCREMENT_CURRENT_ADR(p);
|
||||
hts_automate_increment(&saut, p);
|
||||
while((is_space(*html)
|
||||
|| (inscriptgen && html[0] == '\\' && is_space(html[1])
|
||||
)
|
||||
@@ -1850,7 +1777,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// puis quitter
|
||||
// html++; // sauter les espaces, "" et cie
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
|
||||
/* Stop at \n (LF) if primary links or link lists */
|
||||
@@ -1865,7 +1792,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html == '\\') {
|
||||
if ((*(html + 1) == '\'') || (*(html + 1) == '"')) { // \" ou \'
|
||||
// html+=2; // sauter
|
||||
INCREMENT_CURRENT_ADR(2);
|
||||
hts_automate_increment(&saut, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1913,7 +1840,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (srcset_p) {
|
||||
while(html < r->adr + r->size
|
||||
&& (is_realspace(*html) || *html == ','))
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
eadr = html;
|
||||
|
||||
@@ -3373,7 +3300,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
assertf(eadr - html >= 0); // Should not go back
|
||||
if (eadr > html) {
|
||||
INCREMENT_CURRENT_ADR(eadr - 1 - html);
|
||||
hts_automate_increment(&saut, (int) (eadr - 1 - html));
|
||||
}
|
||||
// adr=eadr-1; // ** sauter
|
||||
|
||||
@@ -3392,7 +3319,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
q++; // skip whitespace and empty candidates
|
||||
if (q < endp && *q != '\0' && *q != ',' && *q != quote
|
||||
&& *q != '<' && *q != '>' && (unsigned char) *q >= 32) {
|
||||
INCREMENT_CURRENT_ADR(q - html); // keep the automate in sync
|
||||
hts_automate_increment(
|
||||
&saut, (int) (q - html)); // keep the automate in sync
|
||||
ok = 1;
|
||||
goto srcset_next;
|
||||
}
|
||||
@@ -3471,20 +3399,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, 0, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, 0, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -3495,7 +3410,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
nofollow = 1; // moins violent
|
||||
opt->state._hts_cancel = 0;
|
||||
}
|
||||
|
||||
}
|
||||
// refresh the backing system each 2 seconds
|
||||
if (engine_stats()) {
|
||||
@@ -3532,7 +3446,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* Flush and save to disk */
|
||||
HT_ADD_END; // achever
|
||||
if (TypedArraySize(output_buffer) != 0) {
|
||||
hts_finish_html_file(
|
||||
opt, cache, r, &fp, TypedArrayElts(output_buffer),
|
||||
TypedArraySize(output_buffer), urladr(), urlfil(), savename());
|
||||
}
|
||||
TypedArrayFree(output_buffer);
|
||||
}
|
||||
//
|
||||
//
|
||||
@@ -3557,6 +3476,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil) {
|
||||
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
|
||||
const int norm_query = opt->urlhack && !opt->no_query_dedup;
|
||||
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (strcasecmp(jump_identification_const(moved_adr),
|
||||
jump_identification_const(cur_adr)) != 0)
|
||||
return HTS_FALSE;
|
||||
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
|
||||
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
|
||||
return strcasecmp(n_fil, pn_fil) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3596,41 +3533,15 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
//
|
||||
|
||||
strcpybuff(mov_url, r->location);
|
||||
url_drop_fragment(mov_url);
|
||||
|
||||
// url qque -> adresse+fichier
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3658,7 +3569,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3681,7 +3602,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3705,7 +3630,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4022,22 +3946,8 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
{
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
b = 0;
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)
|
||||
|| !back_checkmirror(opt)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr) || !back_checkmirror(opt)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4139,21 +4049,11 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
|
||||
while(opt->state._hts_setpause || back_pluggable_sockets_strict(sback, opt) <= 0) { // on fait la pause..
|
||||
opt->state._hts_in_html_parsing = 6;
|
||||
back_wait(sback, opt, cache, HTS_STAT.stat_timestart);
|
||||
/* time limit (-E) exceeded: stop waiting for a socket (#481) */
|
||||
if (!back_checkmirror(opt))
|
||||
break;
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
@@ -4340,26 +4240,12 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
||||
freet(s);
|
||||
}
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
*stre->exit_xh_ = 1; // exit requested
|
||||
XH_uninit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if HTS_POLL
|
||||
@@ -4592,10 +4478,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
IS_DELAYED_EXT(afs->save) && continue_loop && loops < 7; loops++) {
|
||||
continue_loop = 0;
|
||||
|
||||
/*
|
||||
Wait for an available slot
|
||||
*/
|
||||
WAIT_FOR_AVAILABLE_SOCKET();
|
||||
/* Wait for an available slot */
|
||||
if (!hts_wait_available_socket(sback, opt, cache, ptr))
|
||||
return -1;
|
||||
|
||||
/* We can lookup directly in the cache to speedup this mess */
|
||||
if (opt->delayed_cached) {
|
||||
@@ -4741,39 +4626,28 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
if (ptr >= 0) {
|
||||
back_fillmax(sback, opt, cache, ptr, numero_passe);
|
||||
}
|
||||
// on est obligé d'appeler le shell pour le refresh..
|
||||
{
|
||||
|
||||
// Transfer rate
|
||||
engine_stats();
|
||||
|
||||
// Refresh various stats
|
||||
HTS_STAT.stat_nsocket = back_nsoc(sback);
|
||||
HTS_STAT.stat_errors = fspc(opt, NULL, "error");
|
||||
HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
|
||||
HTS_STAT.stat_infos = fspc(opt, NULL, "info");
|
||||
HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
|
||||
HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
|
||||
|
||||
if (!RUN_CALLBACK7
|
||||
(opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
|
||||
(int) (time_local() - HTS_STAT.stat_timestart), &HTS_STAT)) {
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
if (!hts_loop_tick(sback, opt, b, ptr)) {
|
||||
back_set_unlocked(sback, b);
|
||||
return -1;
|
||||
} else if (opt->state._hts_cancel ||
|
||||
!back_checkmirror(
|
||||
opt)) { // cancel level 2 or 1 (cancel parsing)
|
||||
back_delete(opt, cache, sback, b); // cancel test
|
||||
break;
|
||||
}
|
||||
} while(
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101)
|
||||
||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0)
|
||||
||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0)
|
||||
);
|
||||
} while (
|
||||
/* dns/connect/request */
|
||||
(back[b].status >= 99 && back[b].status <= 101) ||
|
||||
/* For redirects, wait for request to be terminated */
|
||||
(HTTP_IS_REDIRECT(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Same for errors */
|
||||
(HTTP_IS_ERROR(back[b].r.statuscode) && back[b].status > 0) ||
|
||||
/* Contested type: wait for a sniffable body head (or EOF) */
|
||||
(back[b].r.statuscode == HTTP_OK && back[b].status > 0 &&
|
||||
strnotempty(back[b].r.cdispo) == 0 &&
|
||||
back[b].r.size < HTS_SNIFF_LEN &&
|
||||
hts_ext_sniff_wanted(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)));
|
||||
if (b >= 0) {
|
||||
back_set_unlocked(sback, b); // Unlocked entry
|
||||
}
|
||||
@@ -4803,6 +4677,7 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
mov_url[0] = '\0';
|
||||
strcpybuff(mov_url, back[b].r.location); // copier URL
|
||||
url_drop_fragment(mov_url);
|
||||
|
||||
/* Remove (temporarily created) file if it was created */
|
||||
UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), back[b].url_sav));
|
||||
@@ -4907,6 +4782,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* patch url_sav BEFORE finalize: it records/caches under this name
|
||||
*/
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
/* Finalize now as we have the type */
|
||||
if (back[b].status == STATUS_READY) {
|
||||
if (!back[b].finalized) {
|
||||
@@ -4914,8 +4792,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
back_finalize(opt, cache, sback, b);
|
||||
}
|
||||
}
|
||||
/* Patch destination filename for direct-to-disk mode */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
}
|
||||
|
||||
} // b >= 0
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user interactions: pause, add link, delete link..
|
||||
*/
|
||||
@@ -162,27 +175,4 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
/* Apply changes */ \
|
||||
* str->ptr_ = ptr
|
||||
|
||||
#define WAIT_FOR_AVAILABLE_SOCKET() do { \
|
||||
int prev = opt->state._hts_in_html_parsing; \
|
||||
while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
|
||||
opt->state._hts_in_html_parsing = 6; \
|
||||
/* Wait .. */ \
|
||||
back_wait(sback,opt,cache,0); \
|
||||
/* Transfer rate */ \
|
||||
engine_stats(); \
|
||||
/* Refresh various stats */ \
|
||||
HTS_STAT.stat_nsocket=back_nsoc(sback); \
|
||||
HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
|
||||
HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
|
||||
HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
|
||||
HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
|
||||
HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
|
||||
/* Check */ \
|
||||
if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count, -1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
|
||||
return -1; \
|
||||
} \
|
||||
} \
|
||||
opt->state._hts_in_html_parsing = prev; \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
167
src/htsrobots.c
167
src/htsrobots.c
@@ -44,28 +44,84 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
// -- robots --
|
||||
|
||||
/* RFC 9309 path-prefix match; '*' any run, '$' anchors end; linear. */
|
||||
static hts_boolean robots_pattern_match(const char *pattern, const char *path) {
|
||||
size_t patlen = strlen(pattern);
|
||||
hts_boolean anchored = HTS_FALSE;
|
||||
const char *p, *pend, *s;
|
||||
const char *star = NULL, *star_s = NULL;
|
||||
|
||||
if (patlen > 0 && pattern[patlen - 1] == '$') {
|
||||
anchored = HTS_TRUE;
|
||||
patlen--;
|
||||
}
|
||||
p = pattern;
|
||||
pend = pattern + patlen;
|
||||
s = path;
|
||||
while (*s != '\0') {
|
||||
if (p == pend) {
|
||||
if (!anchored)
|
||||
return HTS_TRUE; // prefix matched
|
||||
if (star != NULL) { // anchored: '*' must eat the rest
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
continue;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
if (*p == '*') {
|
||||
star = p++;
|
||||
star_s = s;
|
||||
} else if (*p == *s) {
|
||||
p++;
|
||||
s++;
|
||||
} else if (star != NULL) {
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
} else {
|
||||
return HTS_FALSE;
|
||||
}
|
||||
}
|
||||
while (p < pend && *p == '*')
|
||||
p++;
|
||||
return (p == pend) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
// fil="": only check whether rules are already registered for adr
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
while(robots) {
|
||||
if (strfield2(robots->adr, adr)) {
|
||||
if (fil[0]) {
|
||||
/* RFC 9309: longest pattern wins, Allow beats Disallow on ties. */
|
||||
int ptr = 0;
|
||||
char line[250];
|
||||
char line[HTS_ROBOTS_TOKEN_SIZE];
|
||||
size_t toklen = strlen(robots->token);
|
||||
size_t best_len = 0;
|
||||
hts_boolean matched = HTS_FALSE;
|
||||
hts_boolean best_allow = HTS_FALSE;
|
||||
|
||||
if (strnotempty(robots->token)) {
|
||||
do {
|
||||
ptr += binput(robots->token + ptr, line, 200);
|
||||
if (line[0] == '/') { // absolu
|
||||
if (strfield(fil, line)) { // commence avec ligne
|
||||
return -1; // interdit
|
||||
}
|
||||
} else { // relatif
|
||||
if (strstrcase(fil, line)) {
|
||||
return -1;
|
||||
while (ptr < (int) toklen) {
|
||||
ptr += binput(robots->token + ptr, line, sizeof(line) - 1);
|
||||
if (line[0] != 'A' && line[0] != 'D')
|
||||
continue;
|
||||
{
|
||||
const hts_boolean is_allow =
|
||||
(line[0] == 'A') ? HTS_TRUE : HTS_FALSE;
|
||||
const char *pat = line + 1;
|
||||
|
||||
if (robots_pattern_match(pat, fil)) {
|
||||
const size_t len = strlen(pat);
|
||||
|
||||
if (!matched || len > best_len || (len == best_len && is_allow)) {
|
||||
matched = HTS_TRUE;
|
||||
best_len = len;
|
||||
best_allow = is_allow;
|
||||
}
|
||||
}
|
||||
} while((strnotempty(line)) && (ptr < (int) strlen(robots->token)));
|
||||
}
|
||||
}
|
||||
if (matched && !best_allow)
|
||||
return -1; // forbidden
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@@ -74,6 +130,93 @@ int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Append one rule, "<marker><pattern>\n", to the NUL-terminated `blob` of
   capacity `blobsize`. The rule is silently dropped when marker, pattern,
   newline and terminating NUL do not all fit in the remaining space. */
static void robots_blob_add(char *blob, size_t blobsize, char marker,
                            const char *pat) {
  const size_t end = strlen(blob);
  /* no underflow as long as blob is NUL-terminated within blobsize */
  const size_t room = blobsize - end;

  if (strlen(pat) + 2 >= room) /* +2: marker and '\n' */
    return;
  blob[end] = marker;
  blob[end + 1] = '\0';
  strlcatbuff(blob, pat, blobsize);
  strlcatbuff(blob, "\n", blobsize);
}
|
||||
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow) {
|
||||
size_t bptr = 0;
|
||||
int record = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK blob[HTS_ROBOTS_TOKEN_SIZE];
|
||||
|
||||
blob[0] = '\0';
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", body);
|
||||
#endif
|
||||
while (bptr < bodysize) {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(body + bptr, line, sizeof(line) - 2);
|
||||
comm = strchr(line, '#'); // strip comment
|
||||
if (comm != NULL)
|
||||
*comm = '\0';
|
||||
llen = (int) strlen(line); // strip trailing spaces
|
||||
while (llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a = line + 11;
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // generic group applies to us
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack") ||
|
||||
strfield(a, "webhttrack")) {
|
||||
blob[0] = '\0'; // explicit group: restart capture
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
record = 2; // locked to the httrack group
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
hts_boolean is_allow = strfield(line, "allow:");
|
||||
hts_boolean is_disallow = !is_allow && strfield(line, "disallow:");
|
||||
|
||||
if (is_allow || is_disallow) {
|
||||
char *a = line + (is_allow ? 6 : 9);
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (strnotempty(a)) {
|
||||
if (is_disallow && !keep_root_disallow && strcmp(a, "/") == 0) {
|
||||
// dropped: site-wide disallow ignored by option
|
||||
} else {
|
||||
robots_blob_add(blob, sizeof(blob), is_allow ? 'A' : 'D', a);
|
||||
if (is_disallow && info != NULL &&
|
||||
strlen(a) + 2 < infosize - strlen(info)) {
|
||||
if (strnotempty(info))
|
||||
strlcatbuff(info, ", ", infosize);
|
||||
strlcatbuff(info, a, infosize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (strnotempty(blob))
|
||||
checkrobots_set(robots, adr, blob);
|
||||
}
|
||||
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data) {
|
||||
if (((int) strlen(adr)) >= sizeof(robots->adr) - 2)
|
||||
return 0;
|
||||
|
||||
@@ -39,17 +39,27 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEF_FWSTRUCT_robots_wizard
|
||||
typedef struct robots_wizard robots_wizard;
|
||||
#endif
|
||||
|
||||
/* Per-host blob: one rule per line, first byte 'A'/'D' then path pattern. */
|
||||
#define HTS_ROBOTS_TOKEN_SIZE 4096
|
||||
|
||||
struct robots_wizard {
|
||||
char adr[128];
|
||||
char token[4096];
|
||||
char token[HTS_ROBOTS_TOKEN_SIZE];
|
||||
struct robots_wizard *next;
|
||||
};
|
||||
|
||||
/* Library internal definitions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
/* -1 if `fil` disallowed for `adr` (RFC 9309); empty: -1 if rules exist. */
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil);
|
||||
void checkrobots_free(robots_wizard * robots);
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data);
|
||||
/* Parse robots.txt `body` for `adr`, storing the HTTrack group's rules; `info`
|
||||
gets a disallow summary, `keep_root_disallow` FALSE drops "Disallow: /". */
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -456,6 +456,13 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
|
||||
return b->buf;
|
||||
}
|
||||
|
||||
/** True if 'count' records of >= 1 byte each fit in 'available' bytes; guards
|
||||
an attacker-controlled count driving a large allocation. */
|
||||
static HTS_INLINE HTS_UNUSED hts_boolean hts_count_fits(size_t count,
|
||||
LLint available) {
|
||||
return (available >= 0 && (LLint) count <= available) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
|
||||
added bounds checking. freet() also NULLs the freed pointer and tolerates
|
||||
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -358,12 +358,12 @@ int smallserver(T_SOC soc, char *url, char *method, char *data, char *path) {
|
||||
{NULL, 0}
|
||||
};
|
||||
initStrElt initStr[] = {
|
||||
{"user", "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"},
|
||||
{"footer",
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->"},
|
||||
{"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}
|
||||
};
|
||||
{"user", HTS_DEFAULT_USER_AGENT},
|
||||
{"footer", "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x "
|
||||
"[XR&CO'2014], %s -->"},
|
||||
{"url2",
|
||||
"+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}};
|
||||
int i = 0;
|
||||
|
||||
for(i = 0; initInt[i].name; i++) {
|
||||
|
||||
352
src/htssniff.c
Normal file
352
src/htssniff.c
Normal file
@@ -0,0 +1,352 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include "htssniff.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "htslib.h"
|
||||
|
||||
/* One magic rule: `len` bytes at `off` confirm `mime`. */
|
||||
typedef struct sniff_magic {
|
||||
const char *mime;
|
||||
unsigned short off;
|
||||
unsigned char len;
|
||||
const char *bytes;
|
||||
} sniff_magic;
|
||||
|
||||
/* Direction is mime -> magic (verify a claim, never classify); types with
|
||||
no reliable magic (plain text, css, js..) are deliberately absent. Patterns
|
||||
follow the WHATWG MIME Sniffing Standard tables where it defines them
|
||||
(https://mimesniff.spec.whatwg.org/); the rest covers httrack's wider MIME
|
||||
set. Spec-only types absent from our MIME tables (EOT, font/collection)
|
||||
are omitted as unreachable. */
|
||||
static const sniff_magic sniff_table[] = {
|
||||
/* images */
|
||||
{"image/jpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pipeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/pjpeg", 0, 3, "\xff\xd8\xff"},
|
||||
{"image/png", 0, 8, "\x89PNG\r\n\x1a\n"},
|
||||
{"image/gif", 0, 6, "GIF87a"},
|
||||
{"image/gif", 0, 6, "GIF89a"},
|
||||
{"image/bmp", 0, 2, "BM"},
|
||||
{"image/tiff", 0, 4, "II*\0"},
|
||||
{"image/tiff", 0, 4, "MM\0*"},
|
||||
{"image/x-icon", 0, 4, "\0\0\1\0"},
|
||||
{"image/x-icon", 0, 4, "\0\0\2\0"}, /* Windows cursor, per the spec */
|
||||
{"image/x-portable-bitmap", 0, 2, "P1"},
|
||||
{"image/x-portable-bitmap", 0, 2, "P4"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P3"},
|
||||
{"image/x-portable-pixmap", 0, 2, "P6"},
|
||||
{"image/x-xpixmap", 0, 9, "/* XPM */"},
|
||||
{"image/x-xbitmap", 0, 7, "#define"},
|
||||
{"image/x-rgb", 0, 2, "\x01\xda"},
|
||||
{"image/x-cmu-raster", 0, 4, "\xf1\x00\x40\xbb"},
|
||||
/* audio */
|
||||
{"audio/mpeg", 0, 3, "ID3"},
|
||||
{"audio/basic", 0, 4, ".snd"},
|
||||
{"audio/mid", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/midi", 0, 8, "MThd\0\0\0\6"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio", 0, 4, ".RMF"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".ra\xfd"},
|
||||
{"audio/x-pn-realaudio-plugin", 0, 4, ".RMF"},
|
||||
{"audio/flac", 0, 4, "fLaC"},
|
||||
{"audio/aac", 0, 4, "ADIF"},
|
||||
/* video */
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xba"},
|
||||
{"video/mpeg", 0, 4, "\x00\x00\x01\xb3"},
|
||||
{"video/x-sgi-movie", 0, 4, "MOVI"},
|
||||
/* archives / compression */
|
||||
{"application/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"multipart/x-gzip", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compressed", 0, 3, "\x1f\x8b\x08"},
|
||||
{"application/x-compress", 0, 2, "\x1f\x9d"},
|
||||
{"application/x-bzip2", 0, 3, "BZh"},
|
||||
{"application/x-7z-compressed", 0, 6, "7z\xbc\xaf\x27\x1c"},
|
||||
/* 6-byte prefix common to RAR4 (spec) and RAR5 */
|
||||
{"application/x-rar-compressed", 0, 6, "Rar!\x1a\x07"},
|
||||
{"application/zstd", 0, 4, "\x28\xb5\x2f\xfd"},
|
||||
{"application/arj", 0, 2, "\x60\xea"},
|
||||
{"application/x-cpio", 0, 6, "070701"},
|
||||
{"application/x-cpio", 0, 6, "070707"},
|
||||
{"application/x-cpio", 0, 2, "\xc7\x71"},
|
||||
{"application/x-sv4cpio", 0, 6, "070701"},
|
||||
{"application/x-sv4crc", 0, 6, "070702"},
|
||||
{"application/x-stuffit", 0, 8, "StuffIt "},
|
||||
{"application/x-stuffit", 0, 4, "SIT!"},
|
||||
{"application/mac-binhex40", 0, 10, "(This file"},
|
||||
/* documents */
|
||||
{"application/pdf", 0, 5, "%PDF-"},
|
||||
{"application/postscript", 0, 2, "%!"},
|
||||
{"application/rtf", 0, 5, "{\\rtf"},
|
||||
{"application/x-dvi", 0, 2, "\xf7\x02"},
|
||||
{"application/x-hdf", 0, 4, "\x0e\x03\x13\x01"},
|
||||
{"application/x-hdf", 0, 8, "\x89HDF\r\n\x1a\n"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x01"},
|
||||
{"application/x-netcdf", 0, 4, "CDF\x02"},
|
||||
{"application/x-msaccess", 0, 19, "\0\1\0\0Standard Jet DB"},
|
||||
/* fonts */
|
||||
{"font/woff", 0, 4, "wOFF"},
|
||||
{"font/woff2", 0, 4, "wOF2"},
|
||||
{"font/ttf", 0, 4, "\0\1\0\0"},
|
||||
{"font/ttf", 0, 4, "true"},
|
||||
{"font/otf", 0, 4, "OTTO"},
|
||||
/* misc */
|
||||
{"application/x-shockwave-flash", 0, 3, "FWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "CWS"},
|
||||
{"application/x-shockwave-flash", 0, 3, "ZWS"},
|
||||
{"application/futuresplash", 0, 3, "FWS"},
|
||||
{"application/x-director", 0, 4, "RIFX"},
|
||||
{"application/x-director", 0, 4, "XFIR"},
|
||||
{"application/x-java-vm", 0, 4, "\xca\xfe\xba\xbe"},
|
||||
{"application/wasm", 0, 4, "\0asm"},
|
||||
{"application/x-msmetafile", 0, 4, "\xd7\xcd\xc6\x9a"},
|
||||
{"application/x-msmetafile", 0, 4, "\x01\x00\x09\x00"},
|
||||
{"application/x-x509-ca-cert", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs12", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-mime", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-signature", 0, 2, "\x30\x82"},
|
||||
{"application/x-pkcs7-certificates", 0, 2, "\x30\x82"},
|
||||
{"x-world/x-vrml", 0, 5, "#VRML"},
|
||||
{"application/x-bittorrent", 0, 11, "d8:announce"},
|
||||
{"drawing/x-dwf", 0, 4, "(DWF"},
|
||||
{"application/acad", 0, 4, "AC10"},
|
||||
{NULL, 0, 0, NULL}};
|
||||
|
||||
/* MIME families sharing a container magic */
|
||||
static const char *const zip_mimes[] = {
|
||||
"application/zip", "application/x-zip-compressed", "multipart/x-zip", NULL};
|
||||
static const char *const zip_mime_prefixes[] = {
|
||||
"application/vnd.openxmlformats-officedocument.",
|
||||
"application/vnd.oasis.opendocument.", NULL};
|
||||
static const char *const ole_mimes[] = {"application/msword",
|
||||
"application/excel",
|
||||
"application/vnd.ms-excel",
|
||||
"application/powerpoint",
|
||||
"application/vnd.ms-powerpoint",
|
||||
"application/vnd.ms-project",
|
||||
"application/vnd.ms-works",
|
||||
"application/x-msmoney",
|
||||
"application/x-mspublisher",
|
||||
NULL};
|
||||
static const char *const tar_mimes[] = {
|
||||
"application/x-tar", "application/x-ustar", "application/x-gtar", NULL};
|
||||
static const char *const ogg_mimes[] = {"application/ogg", "audio/ogg",
|
||||
"video/ogg", "audio/opus", NULL};
|
||||
static const char *const ebml_mimes[] = {"video/webm", "audio/webm", NULL};
|
||||
/* ISO-BMFF, any 'ftyp' brand: containers overlap too much to split */
|
||||
static const char *const bmff_mimes[] = {"video/mp4", "audio/mp4",
|
||||
"video/quicktime", NULL};
|
||||
static const char *const avif_mimes[] = {"image/avif", NULL};
|
||||
static const char *const heic_mimes[] = {"image/heic", NULL};
|
||||
static const char *const asf_mimes[] = {"video/x-ms-asf", "video/x-ms-wmv",
|
||||
"video/x-la-asf", NULL};
|
||||
static const char *const xml_mimes[] = {"application/xml", "text/xml",
|
||||
"image/svg+xml", "image/svg-xml", NULL};
|
||||
static const char *const svg_mimes[] = {"image/svg+xml", "image/svg-xml", NULL};
|
||||
static const char *const html_mimes[] = {"text/html", NULL};
|
||||
static const char *const pem_mimes[] = {
|
||||
"application/x-x509-ca-cert", "application/x-pkcs7-certificates",
|
||||
"application/x-pkcs7-mime", "application/x-pkcs7-signature", NULL};
|
||||
|
||||
static hts_boolean mime_in(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield2(list[i], mime))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean mime_in_prefix(const char *const *list, const char *mime) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; list[i] != NULL; i++)
|
||||
if (strfield(mime, list[i]))
|
||||
return HTS_TRUE;
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static hts_boolean has_bytes(const unsigned char *d, size_t n, size_t off,
|
||||
const char *bytes, size_t len) {
|
||||
/* overflow-safe: untrusted n alone on one side */
|
||||
return n >= off && len <= n - off && memcmp(d + off, bytes, len) == 0
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Lowercase an ASCII letter; every other byte value is returned unchanged
   (locale-independent). */
static unsigned char ascii_lower(unsigned char c) {
  if (c < 'A' || c > 'Z')
    return c;
  return (unsigned char) (c + 32); /* 'a' - 'A' */
}
|
||||
|
||||
/* Case-insensitive text prefix after an optional UTF-8 BOM and whitespace. */
|
||||
static hts_boolean has_text_prefix(const unsigned char *d, size_t n,
|
||||
const char *prefix) {
|
||||
const size_t len = strlen(prefix);
|
||||
size_t i, k;
|
||||
|
||||
i = n >= 3 && memcmp(d, "\xef\xbb\xbf", 3) == 0 ? 3 : 0;
|
||||
while (i < n && (d[i] == ' ' || d[i] == '\t' || d[i] == '\r' || d[i] == '\n'))
|
||||
i++;
|
||||
if (len > n - i) /* i <= n from the loop above */
|
||||
return HTS_FALSE;
|
||||
for (k = 0; k < len; k++)
|
||||
if (ascii_lower(d[i + k]) != ascii_lower((unsigned char) prefix[k]))
|
||||
return HTS_FALSE;
|
||||
return HTS_TRUE;
|
||||
}
|
||||
|
||||
typedef enum sniff_op {
|
||||
SNIFF_QUERY_KNOWN, /* is any rule defined for this MIME? */
|
||||
SNIFF_QUERY_MATCH /* do the bytes confirm this MIME? */
|
||||
} sniff_op;
|
||||
|
||||
/* Single walk for both queries so the rule set can't drift apart. */
|
||||
static hts_boolean sniff_eval(sniff_op op, const unsigned char *d, size_t n,
|
||||
const char *mime) {
|
||||
size_t i;
|
||||
|
||||
/* KNOWN short-circuits; MATCH tests the magic */
|
||||
#define SNIFF_RULE(cond) \
|
||||
do { \
|
||||
if (op == SNIFF_QUERY_KNOWN) \
|
||||
return HTS_TRUE; \
|
||||
if (cond) \
|
||||
return HTS_TRUE; \
|
||||
} while (0)
|
||||
|
||||
for (i = 0; sniff_table[i].mime != NULL; i++) {
|
||||
if (strfield2(sniff_table[i].mime, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, sniff_table[i].off, sniff_table[i].bytes,
|
||||
sniff_table[i].len));
|
||||
}
|
||||
}
|
||||
if (mime_in(zip_mimes, mime) || mime_in_prefix(zip_mime_prefixes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "PK\3\4", 4) ||
|
||||
has_bytes(d, n, 0, "PK\5\6", 4));
|
||||
}
|
||||
if (mime_in(ole_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8));
|
||||
}
|
||||
if (mime_in(tar_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 257, "ustar", 5));
|
||||
}
|
||||
if (mime_in(ogg_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "OggS\0", 5));
|
||||
}
|
||||
if (mime_in(ebml_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x1a\x45\xdf\xa3", 4));
|
||||
}
|
||||
if (mime_in(bmff_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftyp", 4));
|
||||
}
|
||||
if (mime_in(avif_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 4, "ftypavif", 8) ||
|
||||
has_bytes(d, n, 4, "ftypavis", 8));
|
||||
}
|
||||
if (mime_in(heic_mimes, mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 4, "ftyphei", 7) || has_bytes(d, n, 4, "ftyphev", 7) ||
|
||||
has_bytes(d, n, 4, "ftypmif1", 8) || has_bytes(d, n, 4, "ftypmsf1", 8));
|
||||
}
|
||||
if (mime_in(asf_mimes, mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "\x30\x26\xb2\x75\x8e\x66\xcf\x11", 8));
|
||||
}
|
||||
if (strfield2("audio/x-wav", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "WAVE", 4));
|
||||
}
|
||||
if (strfield2("video/x-msvideo", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) && has_bytes(d, n, 8, "AVI ", 4));
|
||||
}
|
||||
if (strfield2("image/webp", mime)) {
|
||||
SNIFF_RULE(has_bytes(d, n, 0, "RIFF", 4) &&
|
||||
has_bytes(d, n, 8, "WEBPVP", 6));
|
||||
}
|
||||
if (strfield2("image/x-portable-anymap", mime)) {
|
||||
SNIFF_RULE(n >= 2 && d[0] == 'P' && d[1] >= '1' && d[1] <= '6');
|
||||
}
|
||||
if (strfield2("audio/x-aiff", mime)) {
|
||||
SNIFF_RULE(
|
||||
has_bytes(d, n, 0, "FORM", 4) &&
|
||||
(has_bytes(d, n, 8, "AIFF", 4) || has_bytes(d, n, 8, "AIFC", 4)));
|
||||
}
|
||||
if (strfield2("audio/mpeg", mime)) {
|
||||
/* MPEG audio frame sync (11 bits), valid layer and bitrate fields */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xe0) == 0xe0 &&
|
||||
(d[1] & 0x06) != 0);
|
||||
}
|
||||
if (strfield2("audio/aac", mime)) {
|
||||
/* ADTS sync */
|
||||
SNIFF_RULE(n >= 2 && d[0] == 0xff && (d[1] & 0xf6) == 0xf0);
|
||||
}
|
||||
if (strfield2("video/mp2t", mime)) {
|
||||
SNIFF_RULE(n >= 1 && d[0] == 0x47 && (n <= 188 || d[188] == 0x47));
|
||||
}
|
||||
if (mime_in(xml_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<?xml"));
|
||||
}
|
||||
if (mime_in(svg_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<svg") ||
|
||||
has_text_prefix(d, n, "<!DOCTYPE svg"));
|
||||
}
|
||||
if (mime_in(html_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "<!DOCTYPE") ||
|
||||
has_text_prefix(d, n, "<html") ||
|
||||
has_text_prefix(d, n, "<head"));
|
||||
}
|
||||
if (mime_in(pem_mimes, mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "-----BEGIN"));
|
||||
}
|
||||
if (strfield2("audio/x-mpegurl", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "#EXTM3U"));
|
||||
}
|
||||
if (strfield2("text/x-vcard", mime)) {
|
||||
SNIFF_RULE(has_text_prefix(d, n, "BEGIN:VCARD"));
|
||||
}
|
||||
#undef SNIFF_RULE
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_known(const char *mime) {
|
||||
if (mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_KNOWN, NULL, 0, mime);
|
||||
}
|
||||
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime) {
|
||||
if (data == NULL || size == 0 || mime == NULL || *mime == '\0')
|
||||
return HTS_FALSE;
|
||||
return sniff_eval(SNIFF_QUERY_MATCH, (const unsigned char *) data, size,
|
||||
mime);
|
||||
}
|
||||
50
src/htssniff.h
Normal file
50
src/htssniff.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/* ------------------------------------------------------------ */
|
||||
/*
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2017 Xavier Roche and other contributors
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Important notes:
|
||||
|
||||
- We hereby ask people using this source NOT to use it in purpose of grabbing
|
||||
emails addresses, or collecting any other private information on persons.
|
||||
This would disgrace our work, and spoil the many hours we spent on it.
|
||||
|
||||
Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* File: MIME magic-byte consistency checks */
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#ifndef HTSSNIFF_DEFH
|
||||
#define HTSSNIFF_DEFH
|
||||
|
||||
#include <stddef.h>
|
||||
#include "htsglobal.h"
|
||||
|
||||
/* Leading-body window read to arbitrate a wire/extension MIME conflict. */
|
||||
#define HTS_SNIFF_LEN 512
|
||||
|
||||
/* Can a magic rule ever confirm this MIME? (whether sniffing is worth it) */
|
||||
hts_boolean hts_sniff_mime_known(const char *mime);
|
||||
|
||||
/* TRUE when the leading body bytes are consistent with the claimed MIME;
|
||||
FALSE on unknown MIME, unknown magic, or too-short data (fail-safe). */
|
||||
hts_boolean hts_sniff_mime_consistent(const void *data, size_t size,
|
||||
const char *mime);
|
||||
|
||||
#endif
|
||||
@@ -121,9 +121,6 @@ struct String {
|
||||
/** Byte at POS (read/write). No bounds check; POS must be < StringLength. **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Subcharacter (read/write) **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Byte POS positions from the end (read). POS==1 is the last byte. **/
|
||||
#define StringRight(BLK, POS) (StringBuff(BLK)[StringLength(BLK) - POS])
|
||||
|
||||
@@ -191,8 +188,9 @@ HTS_STATIC char *StringBuffN_(String *blk, int size) {
|
||||
asserts SIZE fits the existing content; does not (re)allocate. **/
|
||||
#define StringSetLength(BLK, SIZE) \
|
||||
do { \
|
||||
if (SIZE >= 0) { \
|
||||
(BLK).length_ = SIZE; \
|
||||
const int len__ = (SIZE); /* signed: negative means strlen(buffer_) */ \
|
||||
if (len__ >= 0) { \
|
||||
(BLK).length_ = len__; \
|
||||
} else { \
|
||||
(BLK).length_ = strlen((BLK).buffer_); \
|
||||
} \
|
||||
@@ -308,10 +306,11 @@ HTS_STATIC void StringAttach(String *blk, char **str) {
|
||||
#define StringCatN(BLK, STR, SIZE) \
|
||||
do { \
|
||||
const char *str__ = (STR); \
|
||||
const size_t usize__ = (SIZE); \
|
||||
if (str__ != NULL) { \
|
||||
size_t size__ = strlen(str__); \
|
||||
if (size__ > (SIZE)) { \
|
||||
size__ = (SIZE); \
|
||||
if (size__ > usize__) { \
|
||||
size__ = usize__; \
|
||||
} \
|
||||
StringMemcat(BLK, str__, size__); \
|
||||
} \
|
||||
|
||||
@@ -80,6 +80,10 @@ htspair_t hts_detect_embed[] = {
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
|
||||
static const htspair_t hts_detect_embed_html5[] = {
|
||||
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
|
||||
|
||||
/* Internal */
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
|
||||
const char *fil, const char *tag,
|
||||
@@ -136,6 +140,17 @@ static int cmp_token(const char *tag, const char *cmp) {
|
||||
&& !isalnum((unsigned char) tag[p]));
|
||||
}
|
||||
|
||||
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
|
||||
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
|
||||
const char *attribute) {
|
||||
int i;
|
||||
for (i = 0; table[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
|
||||
return HTS_TRUE;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
const char *adr, const char *fil, const char *tag,
|
||||
const char *attribute, int *set_prio_to,
|
||||
@@ -163,15 +178,9 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
/* Built-in known tags (<img src=..>, ..) */
|
||||
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
|
||||
int i;
|
||||
|
||||
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, hts_detect_embed[i].tag)
|
||||
&& cmp_token(attribute, hts_detect_embed[i].attr)
|
||||
) {
|
||||
embedded_triggered = 1;
|
||||
break;
|
||||
}
|
||||
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
|
||||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
|
||||
embedded_triggered = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
103
src/htszlib.c
103
src/htszlib.c
@@ -47,48 +47,89 @@ Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/*
|
||||
Unpack file into a new file
|
||||
Unpack file into a new file (gzip, zlib RFC1950 or raw deflate RFC1951).
|
||||
Return value: size of the new file, or -1 if an error occurred
|
||||
*/
|
||||
/* Note: utf-8 */
|
||||
int hts_zunpack(char *filename, char *newfile) {
|
||||
int ret = -1;
|
||||
|
||||
if (filename != NULL && newfile != NULL) {
|
||||
if (filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
const int fd = in != NULL ? fileno(in) : -1;
|
||||
const int dup_fd = fd != -1 ? dup(fd) : -1;
|
||||
// Note: we must dup to be able to fclose cleanly.
|
||||
const gzFile gz = dup_fd != -1 ? gzdopen(dup_fd, "rb") : NULL;
|
||||
if (filename != NULL && newfile != NULL && filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
|
||||
if (gz) {
|
||||
FILE *const fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
int size = 0;
|
||||
if (in != NULL) {
|
||||
unsigned char BIGSTK inbuf[8192];
|
||||
size_t navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
/* gzip/zlib headers -> +32 windowBits; else raw deflate (RFC1951) */
|
||||
const hts_boolean wrapped =
|
||||
(navail >= 2 &&
|
||||
((inbuf[0] == 0x1f && inbuf[1] == 0x8b) ||
|
||||
((inbuf[0] & 0x0f) == Z_DEFLATED &&
|
||||
(((unsigned) inbuf[0] << 8 | inbuf[1]) % 31) == 0)));
|
||||
int attempt;
|
||||
|
||||
if (fpout) {
|
||||
int nr;
|
||||
/* deflate is ambiguous; on failure retry with the other windowBits */
|
||||
for (attempt = 0; attempt < 2 && ret < 0; attempt++) {
|
||||
const int windowBits =
|
||||
(attempt == 0 ? wrapped : !wrapped) ? (32 + MAX_WBITS) : -MAX_WBITS;
|
||||
FILE *fpout;
|
||||
z_stream strm;
|
||||
|
||||
do {
|
||||
char BIGSTK buff[1024];
|
||||
|
||||
nr = gzread(gz, buff, sizeof(buff));
|
||||
if (nr > 0) {
|
||||
size += nr;
|
||||
if (fwrite(buff, 1, nr, fpout) != nr)
|
||||
nr = size = -1;
|
||||
}
|
||||
} while(nr > 0);
|
||||
if (attempt > 0) {
|
||||
/* rewind input; reopening fpout "wb" discards the partial output */
|
||||
if (fseek(in, 0, SEEK_SET) != 0)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
}
|
||||
fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
if (fpout == NULL)
|
||||
break;
|
||||
memset(&strm, 0, sizeof(strm));
|
||||
if (inflateInit2(&strm, windowBits) != Z_OK) {
|
||||
fclose(fpout);
|
||||
} else
|
||||
size = -1;
|
||||
gzclose(gz);
|
||||
ret = (int) size;
|
||||
}
|
||||
if (in != NULL) {
|
||||
fclose(in);
|
||||
break;
|
||||
}
|
||||
{
|
||||
hts_boolean ok = HTS_TRUE;
|
||||
int size = 0;
|
||||
int zerr = Z_OK;
|
||||
|
||||
/* chunked inflate; first chunk in inbuf, single member */
|
||||
do {
|
||||
strm.next_in = inbuf;
|
||||
strm.avail_in = (uInt) navail;
|
||||
do {
|
||||
unsigned char BIGSTK outbuf[8192];
|
||||
size_t produced;
|
||||
|
||||
strm.next_out = outbuf;
|
||||
strm.avail_out = sizeof(outbuf);
|
||||
zerr = inflate(&strm, Z_NO_FLUSH);
|
||||
if (zerr == Z_NEED_DICT || zerr == Z_DATA_ERROR ||
|
||||
zerr == Z_MEM_ERROR || zerr == Z_STREAM_ERROR) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
produced = sizeof(outbuf) - strm.avail_out;
|
||||
if (produced > 0 &&
|
||||
fwrite(outbuf, 1, produced, fpout) != produced) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
size += (int) produced;
|
||||
} while (strm.avail_out == 0);
|
||||
if (!ok || zerr == Z_STREAM_END)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
} while (navail > 0);
|
||||
if (ok && zerr == Z_STREAM_END)
|
||||
ret = size;
|
||||
}
|
||||
inflateEnd(&strm);
|
||||
fclose(fpout);
|
||||
}
|
||||
fclose(in);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
@@ -497,6 +497,12 @@ static const char *GetHttpMessage(int statuscode) {
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
break;
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
break;
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
break;
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
break;
|
||||
|
||||
@@ -90,4 +90,16 @@ refused "dangling-quote argument not refused cleanly"
|
||||
run_only "$tmp/q-lone" '"'
|
||||
refused "lone-quote argument not refused cleanly"
|
||||
|
||||
# --pause (#185): valid MIN[:MAX] accepted; malformed, reversed, over-range and
|
||||
# non-finite values refused cleanly. NaN defeats naive `<`/`>` checks (it
|
||||
# compares false to everything), so it must not slip through to the int cast.
|
||||
run "$tmp/pause-ok" --pause 0.2:0.4
|
||||
accepted "$tmp/pause-ok" "#185: valid --pause range rejected"
|
||||
run "$tmp/pause-fix" --pause 0.2
|
||||
accepted "$tmp/pause-fix" "#185: valid fixed --pause rejected"
|
||||
for bad in nan nan:5 5:nan inf 10:5 99999; do
|
||||
run "$tmp/pause-bad" --pause "$bad"
|
||||
refused "#185: invalid --pause '$bad' not refused cleanly"
|
||||
done
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -18,6 +18,21 @@ ent '&' '&'
|
||||
ent '<>' '<>'
|
||||
ent 'é' 'é'
|
||||
|
||||
# HTML5 names from the WHATWG set
|
||||
ent '…' '…'
|
||||
ent '⋃' '⋃'
|
||||
# longest name (31 chars) exercises the name-length cap
|
||||
ent '∳' '∳'
|
||||
# astral codepoint -> 4-byte UTF-8
|
||||
ent '𝔸' '𝔸'
|
||||
# multi-codepoint refs are skipped at generation, so left verbatim
|
||||
ent 'fj' 'fj'
|
||||
|
||||
# common HTML4 names still decode (regression guard against accidental drops)
|
||||
ent '©®™' '©®™'
|
||||
ent '—–' '—–'
|
||||
ent 'αβ' 'αβ'
|
||||
|
||||
# numeric: decimal and hex
|
||||
ent 'AB' 'AB'
|
||||
ent 'A' 'A'
|
||||
|
||||
7
tests/01_engine-escape-room.test
Normal file
7
tests/01_engine-escape-room.test
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HT_ADD_HTMLESCAPED* must reserve the escaper's worst case (6 for _full).
|
||||
httrack -O /dev/null -#test=escape-room run | grep -q "escape-room self-test OK"
|
||||
@@ -50,27 +50,54 @@ match '*foo*bar' 'foozbar'
|
||||
# '?' is the query-string marker, not a single-char wildcard
|
||||
nomatch 'a?c' 'abc'
|
||||
|
||||
# backslash escapes a metacharacter inside a class so it is matched literally.
|
||||
# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
|
||||
# both X and '\'. These assertions pin that behavior.
|
||||
# Inside a class, backslash escapes the next char as a literal member (#148):
|
||||
# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator.
|
||||
match '*[\*]' '*'
|
||||
match '*[\*]' "\\"
|
||||
nomatch '*[\*]' 'a'
|
||||
nomatch '*[\*]' "\\"
|
||||
match '*[\\]' "\\"
|
||||
nomatch '*[\\]' 'a'
|
||||
nomatch '*[\\]' '*'
|
||||
match '*[\[]' '['
|
||||
match '*[\[]' "\\"
|
||||
nomatch '*[\[]' 'a'
|
||||
nomatch '*[\[]' "\\"
|
||||
match '*[\]]' ']'
|
||||
nomatch '*[\]]' "\\"
|
||||
|
||||
# A literal ']' cannot be a class member: the class parser stops at the first
|
||||
# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
|
||||
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
|
||||
# by a trailing literal ']'. These assertions document the current (buggy)
|
||||
# behavior so any future matcher fix is a deliberate, visible change.
|
||||
nomatch '*[\[\]]' '[' # not matched, despite the docs
|
||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||
nomatch '*[\[\]]' '[]x'
|
||||
# '*[\[\]]' is "the [ or ] character", as the filter guide documents.
|
||||
match '*[\[\]]' '['
|
||||
match '*[\[\]]' ']'
|
||||
nomatch '*[\[\]]' 'a'
|
||||
match '*[\[,\]]' '[' # comma between members is optional
|
||||
match '*[\[,\]]' ']'
|
||||
match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one
|
||||
match '*[a,\[]' '['
|
||||
|
||||
# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
|
||||
# are literal members, not operators.
|
||||
match '*[a\-z]' 'a'
|
||||
match '*[a\-z]' 'z'
|
||||
nomatch '*[a\-z]' 'b' # not the a..z range
|
||||
match '*[\,]' ','
|
||||
nomatch '*[\,]' "\\" # the escape must not leak '\' into the class
|
||||
match '*[\<]' '<'
|
||||
nomatch '*[\<]' "\\"
|
||||
match '*[\[,\],a]' '['
|
||||
match '*[\[,\],a]' ']'
|
||||
match '*[\[,\],a]' 'a'
|
||||
|
||||
# A truncated range '*[a-' is the literal members {a,-}; the parser must not
|
||||
# read past the end decoding it (was a 1-byte heap over-read in the range arm).
|
||||
match '*[a-' 'a'
|
||||
nomatch '*[a-' 'b'
|
||||
|
||||
# *(...) matches exactly one char from the class; *[...] matches a run.
|
||||
match '*(a,b)' 'a'
|
||||
nomatch '*(a,b)' 'aa'
|
||||
nomatch '*(a,b)' 'c'
|
||||
|
||||
# documented composite filters (filters.html)
|
||||
match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
|
||||
nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
|
||||
match '*.html*[]' 'page.html'
|
||||
nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query
|
||||
|
||||
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
|
||||
# means the size is still unknown (scan time). A size exclusion must stay neutral
|
||||
|
||||
7
tests/01_engine-ftp-line.test
Executable file
7
tests/01_engine-ftp-line.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# get_ftp_line bounds a hostile CRLF-less FTP reply into its 1024-byte buffer.
|
||||
httrack -O /dev/null -#test=ftp-line run | grep -q "ftp-line self-test OK"
|
||||
7
tests/01_engine-ftp-userpass.test
Executable file
7
tests/01_engine-ftp-userpass.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ftp_split_userpass bounds an over-long user:pass@ from a hostile ftp:// URL.
|
||||
httrack -O /dev/null -#test=ftp-userpass run | grep -q "ftp-userpass self-test OK"
|
||||
29
tests/01_engine-header.test
Normal file
29
tests/01_engine-header.test
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Response header-line parsing (treathead via -#test=header <raw-line> ...).
|
||||
# Isolates the wire layer from url_savename, which strips traversal on its own.
|
||||
|
||||
# hdr WANT RAW-LINE...
# Runs 'httrack -#test=header' on the raw header line(s) and compares the
# output line starting with 'contenttype=' against WANT.
# On mismatch prints a FAIL message and exits the script with status 1.
# 'out' is not declared local, so it stays set in the script after the call.
# NOTE(review): this script runs under 'set -euo pipefail'; if grep matches
# nothing, the failing assignment presumably aborts the script before the
# FAIL message is printed -- confirm.
hdr() {
|
||||
local want="$1"
|
||||
shift
|
||||
out="$(httrack -O /dev/null -#test=header "$@" | grep '^contenttype=')"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
hdr 'contenttype=application/pdf cdispo=' 'Content-Type: application/pdf'
|
||||
|
||||
# filename= is honored quoted or bare.
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename="report.pdf"'
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename=report.pdf'
|
||||
|
||||
# Path components in the filename are dropped on the wire (RFC 2616).
|
||||
hdr 'contenttype= cdispo=evil.pdf' \
|
||||
'Content-Disposition: attachment; filename="../../evil.pdf"'
|
||||
7
tests/01_engine-inplace-escape.test
Executable file
7
tests/01_engine-inplace-escape.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# inplace_escape_*() must match escape_*() on a copy: guards the shared helper.
|
||||
httrack -O /dev/null -#test=inplace-escape run | grep -q "inplace-escape self-test OK"
|
||||
7
tests/01_engine-java.test
Executable file
7
tests/01_engine-java.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# .class constant-pool count is capped to the file size (calloc DoS).
|
||||
httrack -O /dev/null -#test=java run | grep -q "java constant-pool cap self-test OK"
|
||||
12
tests/01_engine-makeindex.test
Executable file
12
tests/01_engine-makeindex.test
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# hts_finish_makeindex writes the footer and gates the refresh meta on a single
|
||||
# first link (guards the macro->function extraction).
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
httrack -O /dev/null -#test=makeindex "$dir" run |
|
||||
grep -q "makeindex self-test OK"
|
||||
@@ -323,4 +323,33 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||
|
||||
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
|
||||
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
|
||||
site10="$tmp/html5media"
|
||||
mkdir -p "$site10"
|
||||
for f in img ss plain; do gif "$site10/$f.gif"; done
|
||||
printf 'x' >"$site10/v.webm"
|
||||
printf 'x' >"$site10/subs.vtt"
|
||||
cat >"$site10/index.html" <<EOF
|
||||
<html><body><a href="leaf.html">leaf</a></body></html>
|
||||
EOF
|
||||
cat >"$site10/leaf.html" <<EOF
|
||||
<html><body>
|
||||
<img src="img.gif">
|
||||
<picture><source srcset="ss.gif 2x"></picture>
|
||||
<video><source src="v.webm"></video>
|
||||
<video><track src="subs.vtt"></video>
|
||||
<a href="plain.gif">plain link past the boundary</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out10="$tmp/html5media-out"
|
||||
rm -rf "$out10"
|
||||
mkdir -p "$out10"
|
||||
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
|
||||
found "img.gif" "$out10"
|
||||
found "ss.gif" "$out10"
|
||||
found "v.webm" "$out10"
|
||||
found "subs.vtt" "$out10"
|
||||
notfound "plain.gif" "$out10"
|
||||
|
||||
exit 0
|
||||
|
||||
15
tests/01_engine-pause.test
Executable file
15
tests/01_engine-pause.test
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# --pause (#185): the inter-file pause target must stay in [min,max] and spread
|
||||
# across it (a per-call rand() would collapse it toward min). Driven by the
|
||||
# in-process 'httrack -#test=pause' test. POSIX-portable ($(BASH) is /bin/sh on macOS).
|
||||
|
||||
set -eu
|
||||
|
||||
# 'run' is an ignored placeholder argument.
|
||||
out=$(httrack -#test=pause run)
|
||||
|
||||
test "$out" = "pause: OK" || {
|
||||
echo "expected 'pause: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
|
||||
# followed through, not turned into a self-pointing "moved" stub. The decision
|
||||
# helper is exercised by the engine self-test.
|
||||
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"
|
||||
7
tests/01_engine-robots.test
Executable file
7
tests/01_engine-robots.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# robots.txt RFC 9309 Allow/Disallow precedence (#452): longest match wins.
|
||||
httrack -O /dev/null -#test=robots run | grep -q "robots self-test OK"
|
||||
@@ -3,13 +3,38 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
|
||||
# Asserts on the basename of "savename: <path>".
|
||||
# Local save-name resolution (url_savename via -#test=savename <fil> <content-type> [key=value ...]).
|
||||
# name() asserts on the basename, full() on the whole path; prior= registers an
|
||||
# already-crawled link whose sav is rooted under the -O path (/dev/null here).
|
||||
|
||||
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
|
||||
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
|
||||
|
||||
# scratch dir: body= and cached= write temp files (st-savename-body.tmp, hts-cache/)
|
||||
scratch=$(mktemp -d)
|
||||
trap 'rm -rf "$scratch"' EXIT
|
||||
cd "$scratch"
|
||||
|
||||
# run ARGS...
# Invokes the resolved httrack binary ($httrack_bin) with -#test=savename and
# the given arguments, and prints only the text following the 'savename: '
# prefix of matching output lines (other lines are dropped by sed -n).
run() {
|
||||
"$httrack_bin" -O /dev/null -#test=savename "$@" | sed -n 's/^savename: //p'
|
||||
}
|
||||
|
||||
name() {
|
||||
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
|
||||
test "${out##*/}" == "$3" || {
|
||||
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "${out##*/}" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
# full FIL CTYPE WANT [key=value ...]
# Like a name check, but compares the WHOLE path printed by run() against
# WANT instead of only its basename. Extra arguments after the first three
# are forwarded to run() unchanged.
# On mismatch prints a FAIL message and exits the script with status 1.
# 'out' is not declared local, so it stays set in the script after the call.
full() {
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
@@ -39,3 +64,95 @@ name '/types/data.json' 'application/json' 'data.json'
|
||||
|
||||
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
|
||||
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||
|
||||
# A Content-Disposition filename replaces the URL name outright.
|
||||
name '/x.php' 'application/pdf' 'report.pdf' cdispo=report.pdf
|
||||
name '/download' 'text/html' 'setup.exe' cdispo=setup.exe
|
||||
|
||||
# Reserved characters in a hostile Content-Disposition name are sanitized.
|
||||
name '/x.php' 'application/pdf' 'set_up.exe' 'cdispo=set:up.exe'
|
||||
|
||||
# The md5-of-query suffix lands inside a Content-Disposition name too.
|
||||
name '/x.php?id=1' 'application/pdf' 'report681a.pdf' cdispo=report.pdf
|
||||
|
||||
# Still-downloading path (status=-1): mime drives the ext, cdispo is ignored
|
||||
# there (the deliberately unfolded 4th resolve_extension variant).
|
||||
name '/x.pdf' 'text/html' 'x.html' status=-1
|
||||
name '/x.html' 'text/html' 'x.html' status=-1
|
||||
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
|
||||
|
||||
# Contested type (wire disagrees with a specific ext): magic bytes proving the
|
||||
# extension right keep it, anything else trusts the wire as before.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' body=hex:FFD8FFE000104A46
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
|
||||
name '/photo.jpg' 'image/png' 'photo.png'
|
||||
name '/doc.pdf' 'text/html' 'doc.pdf' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }' # no rule for css: wire wins
|
||||
|
||||
# A redirect answer resolves nothing: delayed placeholder name.
|
||||
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
|
||||
|
||||
# Root and query-only URLs get index + the md5-of-query suffix.
|
||||
name '/' 'text/html' 'index.html'
|
||||
name '/?a=1' 'text/html' 'index3872.html'
|
||||
|
||||
# Same URL crawled before: reuse its sav verbatim (case preserved).
|
||||
full '/X.PHP' 'text/html' 'www.example.com/CASE.HTML' \
|
||||
'prior=www.example.com|/X.PHP|www.example.com/CASE.HTML'
|
||||
|
||||
# Another URL owns the name: collision suffix -2, then -3, case-insensitively.
|
||||
name '/x.php' 'text/html' 'x-2.html' \
|
||||
'prior=www.example.com|/other.html|/dev/null/www.example.com/x.html'
|
||||
name '/x.php' 'text/html' 'x-3.html' \
|
||||
'prior=www.example.com|/o1.html|/dev/null/www.example.com/x.html' \
|
||||
'prior=www.example.com|/o2.html|/dev/null/www.example.com/x-2.html'
|
||||
name '/INDEX.HTML' 'text/html' 'INDEX-2.HTML' \
|
||||
'prior=www.example.com|/index.html|/dev/null/www.example.com/index.html'
|
||||
|
||||
# Same basename in another directory is NOT a collision.
|
||||
name '/x.php' 'text/html' 'x.html' \
|
||||
'prior=www.example.com|/sub/x.html|/dev/null/www.example.com/sub/x.html'
|
||||
|
||||
# 8-3 modes: DOS truncates every component to 8+3, ISO9660 level 2 to 31.
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE/DIRECTOR/VERYLONG.HTM' n83=1
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE_C/DIRECTORY_LONG/VERYLONGFILENAME.HTM' n83=2
|
||||
name '/verylongfilename.php' 'text/html' 'VERYLO-2.HTM' n83=1 \
|
||||
'prior=www.example.com|/other.html|/dev/null/EXAMPLE/VERYLONG.HTM'
|
||||
|
||||
# urlhack dedup (#271): // collapse and www-strip map to the prior link's sav;
|
||||
# the per-feature negatives opt out and take a fresh name.
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/PRIOR.html' \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' no-slash=1 \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/www.example.com/W-PRIOR.html' adr=example.com \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/example.com/w.html' adr=example.com no-www=1 \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
|
||||
# Distinct URLs must stay distinct under urlhack (no over-normalization).
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' \
|
||||
'prior=www.example.com|/a/c.php|/dev/null/www.example.com/a/C-PRIOR.html'
|
||||
|
||||
# --strip-query (#112): stripped key dedups onto the prior sav; without the
|
||||
# option the same URLs stay distinct.
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/PAGE-PRIOR.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# A kept key that differs must still block the dedup (no over-stripping).
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=4|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# Hostile fils stay rooted under the mirror: ../ (raw or %2e-encoded) drops out,
|
||||
# control characters become spaces, oversized names cap at 210 chars (the cap
|
||||
# can chop the extension off entirely).
|
||||
full '/../../etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/%2e%2e/%2e%2e/etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/x.php' 'application/pdf' '/dev/null/www.example.com///evil.exe' 'cdispo=../../evil.exe'
|
||||
name $'/evil\rname\t.php' 'text/html' 'evil name .html'
|
||||
name "/$(printf 'a%.0s' {1..300}).php" 'text/html' "$(printf 'a%.0s' {1..210})"
|
||||
|
||||
87
tests/01_engine-sniff.test
Normal file
87
tests/01_engine-sniff.test
Normal file
@@ -0,0 +1,87 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# MIME magic consistency (-#test=sniff <content-type> <hex:..|text>), the
|
||||
# tie-break behind htsname's wire-vs-extension naming.
|
||||
|
||||
# chk MIME BODY WANT
# Runs 'httrack -#test=sniff MIME BODY' and compares the text following the
# 'sniff: ' prefix of its output against WANT (e.g. 'known=1 consistent=1').
# BODY is passed through as given; callers use either 'hex:..' or plain text.
# On mismatch prints a FAIL message and exits the script with status 1.
# 'out' is not declared local, so it stays set in the script after the call.
chk() {
|
||||
local mime="$1" body="$2" want="$3"
|
||||
out="$(httrack -#test=sniff "$mime" "$body" | sed -n 's/^sniff: //p')"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: '$mime' '$body' -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
yes='known=1 consistent=1'
|
||||
no='known=1 consistent=0'
|
||||
unk='known=0 consistent=0'
|
||||
|
||||
# images
|
||||
chk image/jpeg hex:FFD8FFE000104A46 "$yes"
|
||||
chk image/png hex:89504E470D0A1A0A "$yes"
|
||||
chk image/png hex:FFD8FFE000104A46 "$no" # jpeg bytes are not a png
|
||||
chk image/gif 'GIF89a' "$yes"
|
||||
chk image/bmp 'BMxxxx' "$yes"
|
||||
chk image/tiff hex:49492A00 "$yes"
|
||||
chk image/tiff hex:4D4D002A "$yes" # both endians
|
||||
chk image/x-icon hex:00000100 "$yes"
|
||||
chk image/x-icon hex:00000200 "$yes" # Windows cursor, spec maps to x-icon
|
||||
chk image/webp 'RIFFxxxxWEBPVP' "$yes"
|
||||
chk image/webp 'RIFFxxxxWAVE' "$no" # riff subtype discriminates
|
||||
chk image/avif hex:0000001C6674797061766966 "$yes"
|
||||
chk image/avif hex:0000001C6674797068656963 "$no" # heic brand is not avif
|
||||
chk image/heic hex:0000001C6674797068656963 "$yes"
|
||||
chk image/svg+xml '<svg xmlns="x">' "$yes"
|
||||
chk image/svg+xml $'\xef\xbb\xbf <?xml version="1.0"?>' "$yes" # BOM+ws skip
|
||||
|
||||
# audio / video
|
||||
chk audio/mpeg 'ID3xxx' "$yes"
|
||||
chk audio/mpeg hex:FFFB9000 "$yes" # bare frame sync
|
||||
chk audio/aac hex:FFF15080 "$yes"
|
||||
chk audio/flac 'fLaC' "$yes"
|
||||
chk audio/ogg hex:4F67675300 "$yes"
|
||||
chk audio/x-wav 'RIFFxxxxWAVE' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxAVI ' "$yes"
|
||||
chk video/x-msvideo 'RIFFxxxxWAVE' "$no"
|
||||
chk video/mp4 hex:000000186674797069736F6D "$yes"
|
||||
chk video/webm hex:1A45DFA3 "$yes"
|
||||
chk video/mpeg hex:000001BA "$yes"
|
||||
chk video/x-ms-wmv hex:3026B2758E66CF11 "$yes"
|
||||
|
||||
# archives; zip magic covers the office-container families
|
||||
chk application/zip hex:504B0304 "$yes"
|
||||
chk application/vnd.openxmlformats-officedocument.wordprocessingml.document hex:504B0304 "$yes"
|
||||
chk application/vnd.oasis.opendocument.text hex:504B0304 "$yes"
|
||||
chk application/msword hex:D0CF11E0A1B11AE1 "$yes"
|
||||
chk application/msword hex:504B0304 "$no" # legacy .doc is OLE, not zip
|
||||
chk application/x-gzip hex:1F8B08 "$yes"
|
||||
chk application/x-bzip2 'BZh9' "$yes"
|
||||
chk application/x-7z-compressed hex:377ABCAF271C "$yes"
|
||||
chk application/x-rar-compressed hex:526172211A07 "$yes"
|
||||
chk application/zstd hex:28B52FFD "$yes"
|
||||
chk application/x-tar "hex:$(printf '00%.0s' {1..257})7573746172" "$yes" # ustar at 257
|
||||
chk application/x-tar hex:7573746172 "$no"
|
||||
|
||||
# documents, fonts, misc
|
||||
chk application/pdf '%PDF-1.7' "$yes"
|
||||
chk application/pdf '<html><body>soft 404</body></html>' "$no"
|
||||
chk application/postscript '%!PS-Adobe' "$yes"
|
||||
chk application/rtf '{\rtf1' "$yes"
|
||||
chk font/woff2 'wOF2' "$yes"
|
||||
chk font/otf 'OTTO' "$yes"
|
||||
chk font/ttf hex:0001000000 "$yes"
|
||||
chk application/x-shockwave-flash 'CWSx' "$yes"
|
||||
chk application/x-java-vm hex:CAFEBABE "$yes"
|
||||
chk application/wasm hex:0061736D "$yes"
|
||||
chk text/html $' \r\n<!DOCTYPE html><html>' "$yes"
|
||||
chk text/html '<html lang="en">' "$yes"
|
||||
chk text/html 'plain text, no markup' "$no"
|
||||
chk text/xml '<?xml version="1.0"?>' "$yes"
|
||||
|
||||
# no magic rule at all: never confirmed, never blocks the wire type
|
||||
chk text/css 'body { }' "$unk"
|
||||
chk text/plain 'hello' "$unk"
|
||||
chk application/x-javascript 'var x;' "$unk"
|
||||
7
tests/01_engine-status.test
Executable file
7
tests/01_engine-status.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HTTP status -> reason phrase, including the modern 429/451 (#453).
|
||||
httrack -O /dev/null -#test=status run | grep -q "status self-test OK"
|
||||
7
tests/01_engine-unescape-bounds.test
Executable file
7
tests/01_engine-unescape-bounds.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
|
||||
httrack -O /dev/null -#test=unescape-bounds run | grep -q "unescape-bounds self-test OK"
|
||||
7
tests/01_engine-useragent.test
Executable file
7
tests/01_engine-useragent.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Default User-Agent (#449): honest HTTrack token, no Windows 98 relic.
|
||||
httrack -O /dev/null -#test=useragent run | grep -q "useragent self-test OK"
|
||||
11
tests/01_zlib-acceptencoding.test
Executable file
11
tests/01_zlib-acceptencoding.test
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Accept-Encoding (#450): advertise gzip+deflate; decode gzip/zlib/raw-deflate.
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
httrack -O /dev/null -#test=acceptencoding "$dir" run |
|
||||
grep -q "acceptencoding self-test OK"
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# 01_zlib-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
# writer and reader drift together. This reads a *committed* cache frozen by an
|
||||
# earlier build and asserts a fixed set of entries still decodes field- and
|
||||
33
tests/01_zlib-savename-cached.test
Normal file
33
tests/01_zlib-savename-cached.test
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Update-run naming from a real cache entry (-#test=savename cached=<ctype>|<save>).
|
||||
# Named 01_zlib-*: the cache writer needs zlib, which the MSan job can't run.
|
||||
|
||||
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
|
||||
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
|
||||
|
||||
scratch=$(mktemp -d)
|
||||
trap 'rm -rf "$scratch"' EXIT
|
||||
cd "$scratch"
|
||||
|
||||
# name FIL CTYPE WANT [key=value ...]
# Runs '-#test=savename FIL CTYPE [key=value ...]' with the resolved httrack
# binary ($httrack_bin) and compares the BASENAME of the printed
# 'savename: <path>' (everything after the last '/') against WANT.
# On mismatch prints a FAIL message showing the full path and exits with 1.
# 'out' is not declared local, so it stays set in the script after the call.
name() {
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$("$httrack_bin" -O /dev/null -#test=savename "$fil" "$ctype" "$@" | sed -n 's/^savename: //p')"
|
||||
# ${out##*/} strips the longest prefix ending in '/', leaving the basename
test "${out##*/}" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
# No live bytes: the recorded save name (X-Save) reproduces the previous
|
||||
# verdict; cached body bytes (PNG magic) are ignored; css has no magic rule.
|
||||
name '/photo.jpg' 'image/png' 'photo.jpg' 'cached=image/png|www.example.com/photo.jpg'
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
|
||||
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
|
||||
name '/style.css' 'image/png' 'style.css' 'cached=image/png|www.example.com/style.css'
|
||||
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
|
||||
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'
|
||||
@@ -1,11 +1,10 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||
# asserted absent so a regression in either direction fails here.
|
||||
# Content-Type vs URL-extension naming (#267 family, default -%N2). A MISSING
|
||||
# type keeps a specific non-HTML ext; a DECLARED disagreeing type is trusted
|
||||
# unless magic bytes prove the ext right (lie/wrongtype/packed keep theirs),
|
||||
# so a real HTML body (report.pdf) still becomes .html. Wrong names are
|
||||
# asserted absent so a regression in either direction fails.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
@@ -14,7 +13,11 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||
# typeless .png stays .png across the update rather than reverting.
|
||||
# An update pass keeps the names the first crawl chose: type and save name
|
||||
# ride the cache, so a declared-text/html .pdf stays .html, a typeless .png
|
||||
# stays .png, and a sniff-kept ext is reproduced from X-Save even when the
|
||||
# refetched content changed (mutant.jpg serves PNG bytes on the rerun).
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.html' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/wrongtype.jpg' --not-found 'types/wrongtype.png' \
|
||||
--found 'types/bigtype.jpg' --not-found 'types/bigtype.png' \
|
||||
--found 'types/packed.jpg' --not-found 'types/packed.png' \
|
||||
--found 'types/mutant.jpg' --not-found 'types/mutant.png' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
|
||||
@@ -20,6 +20,14 @@ if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "python3 missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
# The fixture needs a second loopback IP (dead 127.0.0.2 + live 127.0.0.1) for
|
||||
# the fallback to have a target; GNU/Hurd has only 127.0.0.1, so skip there.
|
||||
case "$(uname -s)" in
|
||||
GNU | GNU/*)
|
||||
echo "GNU/Hurd: single loopback IP, connect-fallback fixture unbuildable, skipping"
|
||||
exit 77
|
||||
;;
|
||||
esac
|
||||
|
||||
server="$top_srcdir/tests/local-server.py"
|
||||
root="$top_srcdir/tests/server-root"
|
||||
|
||||
22
tests/27_local-cookies-file.test
Normal file
22
tests/27_local-cookies-file.test
Normal file
@@ -0,0 +1,22 @@
|
||||
#!/bin/bash
#
# --cookies-file end to end (#215). /gated/secret.php demands a cookie that no
# page ever sends via Set-Cookie, so the crawl can only reach it when the
# option preloads that cookie from a Netscape cookies.txt. This pins the whole
# CLI -> opt -> cookie_load -> wire path.

set -e

: "${top_srcdir:=..}"

crawl="$top_srcdir/tests/local-crawl.sh"
entry='BASEURL/gated/index.php'

# With the cookie preloaded the secret page is served. Under -o0 a 500 leaves
# no file behind, so --found/--files can only hold on a genuine 200.
bash "$crawl" \
    --cookie 'session=opensesame' \
    --errors 0 \
    --files 2 \
    --found 'gated/index.html' \
    --found 'gated/secret.html' \
    httrack "$entry" -o0

# Control run: no cookie, so the secret answers 500. -o0 suppresses the error
# page, which makes the missing file meaningful (one error + no secret.html).
bash "$crawl" \
    --errors 1 \
    --found 'gated/index.html' \
    --not-found 'gated/secret.html' \
    httrack "$entry" -o0
|
||||
36
tests/28_local-pause.test
Executable file
36
tests/28_local-pause.test
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
#
# --pause (#185): a fixed inter-file delay must slow a multi-file crawl. Measure
# the same crawl with and without --pause and compare: the harness overhead
# cancels, leaving only the pause. Integer seconds keep it portable (BSD date
# has no %N); a lower bound is not timing-flaky since a pause only adds time.

set -e

: "${top_srcdir:=..}"

# python3 runs the local server (mirror local-crawl.sh); skip up front when it
# is absent rather than timing two crawls that have no server to talk to.
command -v python3 >/dev/null 2>&1 || {
    echo "python3 not found; skipping local crawl tests"
    exit 77
}

# run [httrack-args...]
# Crawl once, echo the wall-clock seconds on stdout and return the crawl's own
# exit status. The status is captured explicitly because bash clears -e inside
# a command substitution; without it a failed (or skipped) crawl would be
# reduced to a meaningless timing sample.
run() {
    local t0 t1 rc=0
    t0=$(date +%s)
    bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
        httrack 'BASEURL/types/index.html' -c1 "$@" >/dev/null 2>&1 || rc=$?
    t1=$(date +%s)
    echo $((t1 - t0))
    if [ "$rc" -ne 0 ]; then
        # stderr is not captured by $(...), so this reaches the test log
        echo "local-crawl.sh exited with status ${rc}" >&2
    fi
    return "$rc"
}

# An assignment takes the status of its command substitution, so set -e stops
# here with run()'s status: non-zero fails the test, 77 reports a skip.
base=$(run)
paused=$(run --pause 0.5)
delta=$((paused - base))

echo "crawl: ${base}s, with --pause 0.5: ${paused}s (delta ${delta}s)"
if [ "$delta" -lt 2 ]; then
    echo "FAIL: --pause did not delay the crawl (delta ${delta}s)" >&2
    exit 1
fi
|
||||
11
tests/29_local-redirect-fragment.test
Executable file
11
tests/29_local-redirect-fragment.test
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Issue #204: when a 302 Location carries a #fragment, the fragment has to be
# dropped before the target is requested. The test server is strict and
# answers 400 to any '#' in the request-target, so a leaked fragment shows up
# as a logged error and the target file is never saved.
set -e

: "${top_srcdir:=..}"

expect=(
    --errors 0
    --found 'redir/target.html'
)

bash "$top_srcdir/tests/local-crawl.sh" "${expect[@]}" \
    httrack 'BASEURL/redir/index.html'
|
||||
13
tests/30_local-fragment-link.test
Executable file
13
tests/30_local-fragment-link.test
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Issue #279: a link with an anchor (target.html#sec, written bare or quoted)
# must be fetched without the fragment -- the strict server 400s on a '#' in
# the request -- while the rewritten local link keeps it, so the anchor still
# works in the mirror.
set -e

: "${top_srcdir:=..}"

page='fraglink/index.html'
expect=(
    --errors 0
    --found 'fraglink/target.html'
    --file-matches "$page" 'href=target\.html#sec'
    --file-matches "$page" 'href="target\.html#sec2"'
)

bash "$top_srcdir/tests/local-crawl.sh" "${expect[@]}" \
    httrack 'BASEURL/fraglink/index.html'
|
||||
23
tests/31_local-javaclass.test
Normal file
23
tests/31_local-javaclass.test
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# The java plugin has to load (versioned dlopen name) and walk the constant
# pool of a .class file: hello.gif is referenced only from inside Foo.class,
# so it is crawled only when the plugin parsed that file.
set -e

: "${top_srcdir:=..}"

# Throw-away document root, removed when the script exits.
tmproot=$(mktemp -d)
trap 'rm -rf "$tmproot"' EXIT
site="$tmproot/javaclass"
mkdir "$site"

# Entry page: its only link is the class file.
printf '%s\n' '<html><body><a href="Foo.class">applet</a></body></html>' \
    >"$site/index.html"

# The resource the class file names.
printf 'GIF89a' >"$site/hello.gif"

# magic/minor/major, count=2, one CONSTANT_Utf8 "hello.gif", class/superclass
printf '\xCA\xFE\xBA\xBE\x00\x00\x00\x32\x00\x02\x01\x00\x09hello.gif\x00\x00\x00\x00' \
    >"$site/Foo.class"

expect=(
    --errors 0
    --found 'javaclass/Foo.class'
    --found 'javaclass/hello.gif'
)

bash "$top_srcdir/tests/local-crawl.sh" --root "$tmproot" "${expect[@]}" \
    httrack 'BASEURL/javaclass/index.html'
|
||||
17
tests/32_local-cdispo.test
Normal file
17
tests/32_local-cdispo.test
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
#
# Content-Disposition decides the saved name: the attachment filename wins
# over the URL-derived one, and a filename that tries to traverse directories
# is cut down to its last component so it stays inside the mirror.

set -euo pipefail

: "${top_srcdir:=..}"

expect=(
    --errors 0
    # fetch.php is saved under its attachment name, with the PDF body
    --found 'cdispo/report.pdf'
    --file-matches 'cdispo/report.pdf' '%PDF'
    --not-found 'cdispo/fetch.pdf'
    # "../../evil.pdf" lands in cdispo/, not at the host root
    --found 'cdispo/evil.pdf'
    --not-found 'evil.pdf'
)

bash "$top_srcdir/tests/local-crawl.sh" "${expect[@]}" \
    httrack 'BASEURL/cdispo/index.html'
|
||||
20
tests/33_local-delayed.test
Normal file
20
tests/33_local-delayed.test
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#
# Degenerate delayed-type paths (#5/#107 family). Redirects that never resolve
# to a name have to be dropped cleanly: no .delayed leftovers (local-crawl.sh
# audits that), no "bogus state" cache warnings, and the links that can be
# resolved still land under the right names.

set -euo pipefail

: "${top_srcdir:=..}"

expect=(
    --rerun
    --errors 0
    --found 'delayed/real.pdf'
    --file-matches 'delayed/real.pdf' '%PDF'
    --found 'delayed/notype.bin.html'
    --found 'delayed/empty.html'
    --not-found 'delayed/noloc.html'
    --not-found 'delayed/selfloop.html'
    --not-found 'delayed/chain9.pdf'
    --log-not-found 'bogus state'
)

bash "$top_srcdir/tests/local-crawl.sh" "${expect[@]}" \
    httrack 'BASEURL/delayed/index.html'
|
||||
21
tests/34_local-maxtime.test
Normal file
21
tests/34_local-maxtime.test
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
#
# -E time limit (#481): the server trickles its pages out for minutes, so the
# engine has to stop by itself at -E plus grace and abort whatever transfers
# are still in flight.

set -euo pipefail

: "${top_srcdir:=..}"

expect=(
    # cancelled crawls can orphan .delayed placeholders (#483): skip that audit
    --skip-delayed-audit
    --log-found 'More than 2 seconds passed'
)

start=$(date +%s)
bash "$top_srcdir/tests/local-crawl.sh" "${expect[@]}" \
    httrack 'BASEURL/trickle/index.html' -E2 -c4
finish=$(date +%s)
wall=$((finish - start))

# hard stop is due at -E2 + 5s grace; near TRICKLE_SECONDS means it never fired
[ "$wall" -lt 30 ] || {
    echo "crawl took ${wall}s, -E hard stop did not engage" >&2
    exit 1
}
|
||||
@@ -1,4 +1,4 @@
|
||||
# Committed binary fixture read by 01_engine-cache-golden.test. List it
|
||||
# Committed binary fixture read by 01_zlib-cache-golden.test. List it
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
server-root/fraglink/index.html server-root/fraglink/target.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -25,9 +26,6 @@ TEST_EXTENSIONS = .test
|
||||
TEST_LOG_COMPILER = $(BASH)
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -37,18 +35,37 @@ TESTS = \
|
||||
01_engine-entities.test \
|
||||
01_engine-filelist.test \
|
||||
01_engine-filter.test \
|
||||
01_engine-ftp-line.test \
|
||||
01_engine-ftp-userpass.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-header.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-escape-room.test \
|
||||
01_engine-inplace-escape.test \
|
||||
01_engine-java.test \
|
||||
01_engine-makeindex.test \
|
||||
01_engine-mime.test \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-sniff.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-unescape-bounds.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
01_zlib-savename-cached.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
@@ -72,6 +89,14 @@ TESTS = \
|
||||
23_local-errpage.test \
|
||||
24_local-resume-overlap.test \
|
||||
25_local-mime-exclude.test \
|
||||
26_local-strip-query.test
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test \
|
||||
34_local-maxtime.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -12,11 +12,17 @@
|
||||
# the mirror directory name.
|
||||
#
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -85,6 +91,8 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
|
||||
|
||||
# --- parse leading control flags --------------------------------------------
|
||||
declare -a audit=()
|
||||
declare -a cookies=()
|
||||
skip_delayed_audit=""
|
||||
scheme=http
|
||||
pos=0
|
||||
args=("$@")
|
||||
@@ -105,6 +113,13 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
root="${args[$pos]}"
|
||||
;;
|
||||
--cookie)
|
||||
pos=$((pos + 1))
|
||||
cookies+=("${args[$pos]}")
|
||||
;;
|
||||
--skip-delayed-audit)
|
||||
skip_delayed_audit=1
|
||||
;;
|
||||
--errors | --files)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
@@ -113,6 +128,10 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--file-matches | --file-not-matches)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
|
||||
pos=$((pos + 2))
|
||||
;;
|
||||
httrack)
|
||||
pos=$((pos + 1))
|
||||
break
|
||||
@@ -158,6 +177,17 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
done
|
||||
|
||||
# --- materialize any --cookie entries into a cookies.txt ---------------------
|
||||
if test "${#cookies[@]}" -ne 0; then
    # One tab-separated Netscape cookies.txt line per --cookie NAME=VALUE,
    # scoped to the local server's host:port; the whole loop writes the jar
    # in one redirection, then the jar is handed to httrack.
    jar="${tmpdir}/cookies.txt"
    for spec in "${cookies[@]}"; do
        # NAME is everything before the first '=', VALUE everything after it
        printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \
            "$port" "${spec%%=*}" "${spec#*=}"
    done >"$jar"
    hts+=(--cookies-file "$jar")
fi
|
||||
|
||||
# --- run httrack -------------------------------------------------------------
|
||||
which httrack >/dev/null || die "could not find httrack"
|
||||
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
|
||||
@@ -220,6 +250,17 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# A completed crawl must leave no .delayed temporaries (issue #107).
|
||||
# --skip-delayed-audit: a cancelled crawl can orphan placeholders (issue #483)
|
||||
if test -z "$skip_delayed_audit"; then
    info "checking for leftover .delayed files"
    # report at most five offenders; find errors are not interesting here
    leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
    if test -n "$leftovers"; then
        result "leftover: $leftovers"
        exit 1
    fi
    result "OK"
fi
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
i=0
|
||||
while test "$i" -lt "${#audit[@]}"; do
|
||||
@@ -275,6 +316,24 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
|
||||
result "no match"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-not-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
|
||||
result "matched"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -14,6 +14,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
@@ -110,6 +111,19 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
return self.fail_cookie("badger")
|
||||
self.send_html("\tThis is a test.")
|
||||
|
||||
# --cookies-file (#215): the secret page needs a cookie no page ever sets,
|
||||
# so it is reachable only when --cookies-file preloads it.
|
||||
GATE_COOKIE = ("session", "opensesame")
|
||||
|
||||
def route_gated_index(self):
    """Serve the gated entry page, whose only link points at secret.php."""
    page = '\tThis is a <a href="secret.php">link</a>'
    self.send_html(page)
|
||||
|
||||
def route_gated_secret(self):
    """Serve the secret page only to requests carrying GATE_COOKIE.

    Any other request (cookie missing or wrong value) is handed to
    fail_cookie() and its result is returned.
    """
    cookie_name, expected = self.GATE_COOKIE
    presented = self.request_cookies().get(cookie_name)
    if presented == expected:
        self.send_html("\tThis is the secret.")
        return None
    return self.fail_cookie(cookie_name)
|
||||
|
||||
def route_robots(self):
|
||||
body = b"User-agent: *\nDisallow:\n"
|
||||
self.send_response(200)
|
||||
@@ -121,12 +135,14 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
|
||||
# --- type/extension matrix (issue #267 family) -------------------------
|
||||
|
||||
def send_raw(self, body, content_type):
|
||||
def send_raw(self, body, content_type, extra_headers=()):
|
||||
"""Send a raw body with an explicit Content-Type, or none at all when
|
||||
content_type is None (to observe httrack's typeless-file naming)."""
|
||||
self.send_response(200)
|
||||
if content_type is not None:
|
||||
self.send_header("Content-Type", content_type)
|
||||
for name, value in extra_headers:
|
||||
self.send_header(name, value)
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
@@ -135,6 +151,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
# Fake-binary blobs for the image/pdf/typeless cases.
|
||||
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
|
||||
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
|
||||
FAKE_JPEG = b"\xff\xd8\xff\xe0" + b"\x00" * 64
|
||||
BIG_JPEG = b"\xff\xd8\xff\xe0" + bytes(range(256)) * 64 # > sniff window
|
||||
|
||||
# path -> (body, content_type); None sends no header, "" sends an empty
|
||||
# Content-Type value (no usable type, must be treated like None).
|
||||
@@ -146,6 +164,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/notype.pdf": (FAKE_PDF, None),
|
||||
"/types/emptyct.png": (FAKE_PNG, ""),
|
||||
"/types/lie.png": (FAKE_PNG, "text/html"),
|
||||
"/types/wrongtype.jpg": (FAKE_JPEG, "image/png"),
|
||||
"/types/bigtype.jpg": (BIG_JPEG, "image/png"),
|
||||
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
|
||||
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
|
||||
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
|
||||
@@ -163,6 +183,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
'\t<a href="notype.pdf">notypepdf</a>\n'
|
||||
'\t<img src="emptyct.png" />\n'
|
||||
'\t<img src="lie.png" />\n'
|
||||
'\t<img src="wrongtype.jpg" />\n'
|
||||
'\t<img src="bigtype.jpg" />\n'
|
||||
'\t<img src="mutant.jpg" />\n'
|
||||
'\t<img src="packed.jpg" />\n'
|
||||
'\t<a href="report.pdf">report</a>\n'
|
||||
'\t<a href="page.htm">htm</a>\n'
|
||||
'\t<script src="script.js"></script>\n'
|
||||
@@ -177,6 +201,25 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# content changes between crawls: run 1 sniffs JPEG, the update pass must
|
||||
# keep the run-1 name (recorded verdict) even though the body is now PNG
|
||||
MUTANT_SEEN = set()
|
||||
|
||||
def route_types_mutant(self):
    """Serve a body that changes after the first non-HEAD request.

    The first non-HEAD hit on a path gets JPEG bytes; once the path is in
    MUTANT_SEEN, later requests get PNG bytes. The declared type is
    always image/png. HEAD requests never mark the path as seen.
    """
    key = urlsplit(self.path).path
    seen_before = key in self.MUTANT_SEEN
    if self.command != "HEAD":
        self.MUTANT_SEEN.add(key)
    payload = self.FAKE_PNG if seen_before else self.FAKE_JPEG
    self.send_raw(payload, "image/png")
|
||||
|
||||
# gzip on the wire: the sniff must see the decoded body, not the stream
|
||||
def route_types_packed(self):
    """Serve gzip-compressed JPEG bytes declared as image/png.

    The Content-Encoding: gzip header tells the client the wire bytes
    are compressed.
    """
    wire_body = gzip.compress(self.FAKE_JPEG)
    encoding = [("Content-Encoding", "gzip")]
    self.send_raw(wire_body, "image/png", extra_headers=encoding)
|
||||
|
||||
# --- MIME-type exclusion abort (issue #58) -----------------------------
|
||||
# A -mime:application/pdf filter must abort the transfer once the header
|
||||
# arrives, not download the whole body and discard it.
|
||||
@@ -341,10 +384,116 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# Content-Disposition naming: the attachment filename replaces the
|
||||
# URL-derived name; path components in it are stripped (RFC 2616).
|
||||
CDISPO_NAMES = {
|
||||
"/cdispo/fetch.php": "report.pdf",
|
||||
"/cdispo/evil.php": "../../evil.pdf",
|
||||
}
|
||||
|
||||
def route_cdispo_index(self):
    """Serve the index linking to both Content-Disposition endpoints."""
    links = (
        '\t<a href="fetch.php">report</a>\n',
        '\t<a href="evil.php">evil</a>\n',
    )
    self.send_html("".join(links))
|
||||
|
||||
def route_cdispo(self):
    """Serve a PDF as an attachment named after the request path.

    The filename comes from CDISPO_NAMES, keyed by the URL path without
    its query string.
    """
    request_path = urlsplit(self.path).path
    disposition = 'attachment; filename="%s"' % self.CDISPO_NAMES[request_path]
    header = ("Content-Disposition", disposition)
    self.send_raw(self.FAKE_PDF, "application/pdf", extra_headers=[header])
|
||||
|
||||
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
|
||||
# that must be dropped before the target is fetched. A leaked '#' reaches the
|
||||
# strict-server guard below and 400s.
|
||||
def route_redir_index(self):
    """Serve the entry page, whose single link leads to the redirector."""
    page = '\t<a href="go.php">go</a>'
    self.send_html(page)
|
||||
|
||||
def route_redir_go(self):
    """Answer 302 with a Location that carries a #fragment.

    Delegates to send_redirect(), which emits the same
    302 / Location / Content-Length: 0 sequence this route used to spell
    out by hand, so there is one definition of a bodiless redirect.
    """
    self.send_redirect("target.html#section")
|
||||
|
||||
def route_redir_target(self):
    """Serve the small HTML page the redirect points at."""
    page = b"<html><body>redirect target</body></html>\n"
    self.send_raw(page, "text/html")
|
||||
|
||||
# --- delayed-type degenerate paths (issues #5/#107) --------------------
|
||||
def route_delayed_index(self):
    """Serve the index that links to every delayed-type test endpoint."""
    targets = (
        ("noloc.php", "noloc"),
        ("selfloop.php", "selfloop"),
        ("chain1.php", "chain"),
        ("redir.php", "redir"),
        ("notype.bin", "notype"),
        ("empty.php", "empty"),
    )
    anchors = ['\t<a href="%s">%s</a>\n' % target for target in targets]
    self.send_html("".join(anchors))
|
||||
|
||||
def send_redirect(self, location):
    """Send a bodiless 302 response.

    location: value for the Location header, or None to omit the header
    altogether.
    """
    self.send_response(302, "Found")
    headers = [("Content-Length", "0")]
    if location is not None:
        # Location goes out ahead of Content-Length
        headers.insert(0, ("Location", location))
    for name, value in headers:
        self.send_header(name, value)
    self.end_headers()
|
||||
|
||||
def route_delayed_noloc(self):
    """Answer 302 without any Location header, so the name never resolves."""
    target = None
    self.send_redirect(target)

def route_delayed_selfloop(self):
    """Answer 302 pointing back at this same endpoint."""
    target = "selfloop.php"
    self.send_redirect(target)
|
||||
|
||||
def route_delayed_chain(self):
    """Serve one hop of the chain1..chain9 redirect chain.

    chainN redirects to chainN+1 for N below 9; the last hop serves a
    PDF. chain1..chain9 is one more hop than the type-check redirect
    budget.
    """
    request_path = urlsplit(self.path).path
    # "/delayed/chain7.php" -> "7.php" -> "7"
    hop = int(request_path.rsplit("chain", 1)[1].split(".")[0])
    if hop >= 9:
        self.send_raw(self.FAKE_PDF, "application/pdf")
        return
    self.send_redirect("chain%d.php" % (hop + 1))
|
||||
|
||||
def route_delayed_redir(self):
    """Answer 302 to real.pdf, a redirect that does resolve."""
    target = "real.pdf"
    self.send_redirect(target)

def route_delayed_realpdf(self):
    """Serve PDF bytes declared as application/pdf."""
    payload = self.FAKE_PDF
    self.send_raw(payload, "application/pdf")

def route_delayed_notype(self):
    """Serve PDF bytes with no Content-Type header at all."""
    payload = self.FAKE_PDF
    self.send_raw(payload, None)

def route_delayed_empty(self):
    """Serve an empty text/html body (200 + Content-Length: 0)."""
    payload = b""
    self.send_raw(payload, "text/html")
|
||||
|
||||
# -E time-limit (#481): pages that trickle far longer than any -E budget,
|
||||
# so only an engine-side abort can end the crawl.
|
||||
TRICKLE_SECONDS = 60
|
||||
|
||||
def route_trickle_index(self):
    """Serve the index linking to the eight trickling pages p0..p7."""
    anchors = [
        '\t<a href="p%d.bin">p%d</a>\n' % (page, page) for page in range(8)
    ]
    self.send_html("".join(anchors))
|
||||
|
||||
def route_trickle_page(self):
    """Serve a body two bytes at a time, one chunk per second.

    The announced Content-Length is 2 * TRICKLE_SECONDS, so the full
    transfer takes TRICKLE_SECONDS seconds. HEAD gets the headers only.
    An OSError while writing ends the transfer quietly (presumably the
    client dropping the connection mid-body).
    """
    total = self.TRICKLE_SECONDS
    self.send_response(200)
    self.send_header("Content-Type", "application/octet-stream")
    self.send_header("Content-Length", str(2 * total))
    self.end_headers()
    if self.command != "HEAD":
        try:
            remaining = total
            while remaining > 0:
                self.wfile.write(b"xy")
                self.wfile.flush()
                time.sleep(1.0)
                remaining -= 1
        except OSError:
            pass
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
"/cookies/third.php": route_third,
|
||||
"/gated/index.php": route_gated_index,
|
||||
"/gated/secret.php": route_gated_secret,
|
||||
"/robots.txt": route_robots,
|
||||
"/types/index.html": route_types_index,
|
||||
"/types/control.php": route_types,
|
||||
@@ -354,6 +503,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/notype.pdf": route_types,
|
||||
"/types/emptyct.png": route_types,
|
||||
"/types/lie.png": route_types,
|
||||
"/types/wrongtype.jpg": route_types,
|
||||
"/types/bigtype.jpg": route_types,
|
||||
"/types/mutant.jpg": route_types_mutant,
|
||||
"/types/packed.jpg": route_types_packed,
|
||||
"/types/report.pdf": route_types,
|
||||
"/types/page.htm": route_types,
|
||||
"/types/script.js": route_types,
|
||||
@@ -376,10 +529,51 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/mimex/index.html": route_mimex_index,
|
||||
"/mimex/blob.pdf": route_mimex_blob,
|
||||
"/mimex/real.html": route_mimex_real,
|
||||
"/cdispo/index.html": route_cdispo_index,
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/delayed/index.html": route_delayed_index,
|
||||
"/trickle/index.html": route_trickle_index,
|
||||
"/trickle/p0.bin": route_trickle_page,
|
||||
"/trickle/p1.bin": route_trickle_page,
|
||||
"/trickle/p2.bin": route_trickle_page,
|
||||
"/trickle/p3.bin": route_trickle_page,
|
||||
"/trickle/p4.bin": route_trickle_page,
|
||||
"/trickle/p5.bin": route_trickle_page,
|
||||
"/trickle/p6.bin": route_trickle_page,
|
||||
"/trickle/p7.bin": route_trickle_page,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
"/delayed/real.pdf": route_delayed_realpdf,
|
||||
"/delayed/notype.bin": route_delayed_notype,
|
||||
"/delayed/empty.php": route_delayed_empty,
|
||||
"/delayed/chain1.php": route_delayed_chain,
|
||||
"/delayed/chain2.php": route_delayed_chain,
|
||||
"/delayed/chain3.php": route_delayed_chain,
|
||||
"/delayed/chain4.php": route_delayed_chain,
|
||||
"/delayed/chain5.php": route_delayed_chain,
|
||||
"/delayed/chain6.php": route_delayed_chain,
|
||||
"/delayed/chain7.php": route_delayed_chain,
|
||||
"/delayed/chain8.php": route_delayed_chain,
|
||||
"/delayed/chain9.php": route_delayed_chain,
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
def reject_fragment(self):
    """Refuse any request whose target contains a '#'.

    Strict server: a '#' on the wire means the client failed to drop a
    fragment (#204), which RFC 3986 forbids. Sends an empty 400 and
    returns True when the request was rejected, False otherwise.
    """
    if "#" not in self.path:
        return False
    self.send_response(400, "Bad Request")
    self.send_header("Content-Length", "0")
    self.end_headers()
    return True
|
||||
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
@@ -391,10 +585,14 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
return False
|
||||
|
||||
def do_GET(self):
    """Handle GET: fragment guard first, then custom routes, then the
    base class's handling."""
    if self.reject_fragment() or self.dispatch():
        return
    super().do_GET()

def do_HEAD(self):
    """Handle HEAD with the same order of precedence as do_GET."""
    if self.reject_fragment() or self.dispatch():
        return
    super().do_HEAD()
|
||||
|
||||
|
||||
4
tests/server-root/fraglink/index.html
Normal file
4
tests/server-root/fraglink/index.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html><body>
|
||||
<a href=target.html#sec>unquoted fragment link</a>
|
||||
<a href="target.html#sec2">quoted fragment link</a>
|
||||
</body></html>
|
||||
1
tests/server-root/fraglink/target.html
Normal file
1
tests/server-root/fraglink/target.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>
|
||||
Reference in New Issue
Block a user