mirror of
https://github.com/xroche/httrack.git
synced 2026-06-29 21:45:24 +03:00
Compare commits
1 Commits
worktree-a
...
fix/urlhac
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
600001b282 |
49
.github/workflows/ci.yml
vendored
49
.github/workflows/ci.yml
vendored
@@ -61,50 +61,6 @@ jobs:
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Reproduce the Debian buildds: they build in a minimal chroot with no
|
||||
# python3, so the local-server tests must SKIP (exit 77), not fail. GitHub
|
||||
# runners ship python3, so every other job hides this path; here we remove it
|
||||
# before `make check`. This is the guard that would have caught the 3.49.10-1
|
||||
# FTBFS (28_local-pause failed instead of skipping when python3 was absent).
|
||||
buildd-no-python3:
|
||||
name: build (no python3, Debian buildd)
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool autoconf-archive \
|
||||
zlib1g-dev libssl-dev
|
||||
|
||||
- name: Configure
|
||||
run: |
|
||||
set -euo pipefail
|
||||
autoreconf -fi
|
||||
./configure
|
||||
|
||||
- name: Build
|
||||
run: make -j"$(nproc)"
|
||||
|
||||
- name: Test without python3
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Hide every python3* so `command -v python3` fails like it does in the
|
||||
# buildd chroot; masking with /bin/false would still resolve.
|
||||
sudo find /usr/bin /usr/local/bin -maxdepth 1 -name 'python3*' \
|
||||
-exec mv {} {}.hidden \;
|
||||
! command -v python3
|
||||
make check
|
||||
|
||||
- name: Print the test log on failure
|
||||
if: failure()
|
||||
run: cat tests/test-suite.log 2>/dev/null || true
|
||||
|
||||
# Portability: build and test on macOS (Darwin/clang) on a native runner --
|
||||
# no VM. The tree has no __APPLE__ branches, so Darwin exercises the
|
||||
# generic-Unix path on a second libc and kernel. brew's openssl@3 is keg-only,
|
||||
@@ -269,9 +225,8 @@ jobs:
|
||||
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# 01_engine-* only; zlib-dependent self-tests are named 01_zlib-* and
|
||||
# skipped here (uninstrumented libz floods MSan with false positives).
|
||||
tests="$(cd tests && ls 01_engine-*.test | tr '\n' ' ')"
|
||||
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||
make check TESTS="$tests"
|
||||
|
||||
- name: Print the test log on failure
|
||||
|
||||
@@ -39,10 +39,6 @@ Welcome, and nothing to disclose. Two rules:
|
||||
|
||||
The sign-off covers AI-assisted code too.
|
||||
|
||||
## Translations
|
||||
|
||||
Interface strings live in [`lang/`](lang/). See [lang/README.md](lang/README.md) for the file format and how to add or update a language.
|
||||
|
||||
## Bugs
|
||||
|
||||
Open an issue with the version, OS, command used, and expected vs actual result.
|
||||
|
||||
10
configure.ac
10
configure.ac
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.10], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.9], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,10 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:2:0: 3.49.10 only appends tail fields to the options struct (no existing
|
||||
# symbol or offset changed vs 3.49.9), so it stays soname .so.3; bump revision.
|
||||
# (3:0:0 was the htsblk mime-buffer widening, the ABI break that moved .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:2:0"
|
||||
# 3:1:0: 3.49.9 changed code but not the exported interface vs 3.49.8 (same 164
|
||||
# symbols, no struct-layout change), so bump revision only. (3:0:0 was the htsblk
|
||||
# mime-buffer widening, an ABI break that moved the soname .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:1:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
13
debian/changelog
vendored
13
debian/changelog
vendored
@@ -1,16 +1,3 @@
|
||||
httrack (3.49.10-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: new download-pacing and URL-handling options plus a
|
||||
batch of crawl and robustness fixes (full list in history.txt).
|
||||
* Rewrite debian/copyright in machine-readable DEP-5 format, crediting the
|
||||
bundled minizip, md5 and coucal sources (#415).
|
||||
* Lead the webhttrack browser dependency with chromium so httrack is not
|
||||
dragged into the firefox-esr autoremoval cascade (#436).
|
||||
* Override the embedded-library lint for the bundled minizip (#419).
|
||||
* Bump Standards-Version to 4.7.4 (no changes required).
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 28 Jun 2026 14:01:53 +0200
|
||||
|
||||
httrack (3.49.9-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: Content-Type and file-type detection fixes (trust a
|
||||
|
||||
4
debian/control
vendored
4
debian/control
vendored
@@ -2,7 +2,7 @@ Source: httrack
|
||||
Section: web
|
||||
Priority: optional
|
||||
Maintainer: Xavier Roche <roche@httrack.com>
|
||||
Standards-Version: 4.7.4
|
||||
Standards-Version: 4.7.0
|
||||
Build-Depends: debhelper-compat (= 13), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Rules-Requires-Root: no
|
||||
Homepage: http://www.httrack.com
|
||||
@@ -30,7 +30,7 @@ Description: Copy websites to your computer (Offline browser)
|
||||
Package: webhttrack
|
||||
Architecture: any
|
||||
Multi-Arch: foreign
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, chromium | firefox-esr | www-browser
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}, webhttrack-common, sensible-utils, firefox-esr | chromium | www-browser
|
||||
Replaces: webhttrack-common (<< 3.43.9-2)
|
||||
Breaks: webhttrack-common (<< 3.43.9-2)
|
||||
Suggests: httrack, httrack-doc
|
||||
|
||||
20
history.txt
20
history.txt
@@ -4,25 +4,7 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-10
|
||||
+ New: --cookies-file to preload a Netscape cookies.txt before crawling (#215)
|
||||
+ New: --pause to space out file downloads by a random delay (#185)
|
||||
+ New: --strip-query to drop selected query keys from the dedup naming (#112)
|
||||
+ Changed: split the -%u URL hacks into independent --keep-www-prefix, --keep-double-slashes and --keep-query-order toggles (#271)
|
||||
+ Fixed: follow a redirect Location after dropping its #fragment, instead of requesting the fragment and polluting the saved name (#204)
|
||||
+ Fixed: escaped brackets inside a *[...] filter character class (#148)
|
||||
+ Fixed: honor the server's Content-Range when resuming a partial download, instead of appending overlapping bytes (#198)
|
||||
+ Fixed: abort the download as soon as the response type is excluded by -mime:, instead of fetching then discarding the body (#58)
|
||||
+ Fixed: keep size-based filter rules neutral until the file size is known (#143)
|
||||
+ Fixed: stop the mirror with a clean fatal error on a cache write failure, instead of crashing (#174, #219)
|
||||
+ Fixed: stop the 412/416 partial re-get loop on --continue and --update (#206)
|
||||
+ Fixed: keep an unrecognized URL tail instead of mangling it to .html (#115)
|
||||
+ Fixed: honor --tolerant (-%B) on a broken Content-Length, and fix an out-of-bounds read it exposed (#32, #41)
|
||||
+ Fixed: fall back to the next resolved address when a connection fails or stalls, instead of hanging on a dead IPv6 address
|
||||
+ Fixed: report why a -%L URL list could not be loaded (#49)
|
||||
+ Changed: multiple internal hardening, build and CI improvements
|
||||
|
||||
.49-9
|
||||
3.49-9
|
||||
+ Fixed: file-type detection from the Content-Type header: trust a declared type over a binary URL extension, honor --assume under the delayed type check, and keep a known extension against a bogus or empty Content-Type (#267, #29, #56)
|
||||
+ Fixed: an uninitialized-buffer read when the Content-Type is empty (#411)
|
||||
+ Fixed: restored C++ source-compatibility of the installed headers so reverse dependencies (httraqt) build again (#413)
|
||||
|
||||
@@ -247,7 +247,7 @@ See also: The <a href="faq.html#VF1">FAQ</a><br>
|
||||
<td>the \ character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td nowrap><tt>*[\[,\]]</tt></td>
|
||||
<td nowrap><tt>*[\[\]]</tt></td>
|
||||
<td>the [ or ] character</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
||||
@@ -295,7 +295,7 @@ Max Depth
|
||||
Maximum external depth:
|
||||
Maximum external depth:
|
||||
Filters (refuse/accept links) :
|
||||
Filters (refuse/accept links):
|
||||
Filters (refuse/accept links) :
|
||||
Paths
|
||||
Paths
|
||||
Save prefs
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
# Translating HTTrack
|
||||
|
||||
Interface strings live here, one `.txt` file per language. `English.txt` is the reference: every other file maps each English string to its translation.
|
||||
|
||||
## File format
|
||||
|
||||
Plain text, entries in consecutive pairs of lines:
|
||||
|
||||
```
|
||||
<English string>
|
||||
<translation>
|
||||
```
|
||||
|
||||
The first line of a pair is the lookup key and must stay identical to the one in `English.txt`; translate only the second line. Missing entries fall back to the English text at runtime, so a partial translation works.
|
||||
|
||||
Preserve any `\r\n`, `\t` and `printf` placeholders (`%s`, `%d`, ...) in the translation.
|
||||
|
||||
A few `LANGUAGE_*` entries at the top describe the file itself:
|
||||
|
||||
| Key | Meaning |
|
||||
| --- | --- |
|
||||
| `LANGUAGE_NAME` | Name shown in the language picker, in its own language (`Deutsch`, not `German`) |
|
||||
| `LANGUAGE_ISO` | ISO 639 code, with region if needed (`de`, `pt_BR`) |
|
||||
| `LANGUAGE_CHARSET` | Encoding the file is saved in (`ISO-8859-1`, `windows-1251`, `UTF-8`, ...) |
|
||||
| `LANGUAGE_AUTHOR` | Your name and contact |
|
||||
| `LANGUAGE_WINDOWSID` | Windows locale name used by WinHTTrack (`German (Standard)`) |
|
||||
|
||||
Save the file in exactly its declared `LANGUAGE_CHARSET`; an editor that rewrites it as UTF-8 will corrupt the non-ASCII bytes.
|
||||
|
||||
## Adding or updating a language
|
||||
|
||||
1. Copy `English.txt` to `<Language>.txt`, or edit the existing file.
|
||||
2. Translate each second line; leave the English keys untouched.
|
||||
3. Fill in the `LANGUAGE_*` header for a new file.
|
||||
4. Open a pull request, or attach the file to a GitHub issue.
|
||||
|
||||
When new strings land in `English.txt` they show up untranslated (as English) until a translator fills them in.
|
||||
@@ -24,7 +24,6 @@ httrack \- offline browser : copy websites to a local directory
|
||||
[ \fB\-EN, \-\-max\-time[=N]\fR ]
|
||||
[ \fB\-AN, \-\-max\-rate[=N]\fR ]
|
||||
[ \fB\-%cN, \-\-connection\-per\-second[=N]\fR ]
|
||||
[ \fB\-%G, \-\-pause\fR ]
|
||||
[ \fB\-GN, \-\-max\-pause[=N]\fR ]
|
||||
[ \fB\-cN, \-\-sockets[=N]\fR ]
|
||||
[ \fB\-TN, \-\-timeout[=N]\fR ]
|
||||
@@ -50,7 +49,6 @@ httrack \- offline browser : copy websites to a local directory
|
||||
[ \fB\-%p, \-\-preserve\fR ]
|
||||
[ \fB\-%T, \-\-utf8\-conversion\fR ]
|
||||
[ \fB\-bN, \-\-cookies[=N]\fR ]
|
||||
[ \fB\-%K, \-\-cookies\-file\fR ]
|
||||
[ \fB\-u, \-\-check\-type[=N]\fR ]
|
||||
[ \fB\-j, \-\-parse\-java[=N]\fR ]
|
||||
[ \fB\-sN, \-\-robots[=N]\fR ]
|
||||
@@ -156,8 +154,6 @@ maximum mirror time in seconds (60=1 minute, 3600=1 hour) (\-\-max\-time[=N])
|
||||
maximum transfer rate in bytes/seconds (1000=1KB/s max) (\-\-max\-rate[=N])
|
||||
.IP \-%cN
|
||||
maximum number of connections/seconds (*%c10) (\-\-connection\-per\-second[=N])
|
||||
.IP \-%G
|
||||
random pause of MIN[:MAX] seconds between files (e.g. %G5:10) (\-\-pause <param>)
|
||||
.IP \-GN
|
||||
pause transfer if N bytes reached, and wait until lock file is deleted (\-\-max\-pause[=N])
|
||||
.SS Flow control:
|
||||
@@ -216,8 +212,6 @@ links conversion to UTF\-8 (\-\-utf8\-conversion)
|
||||
.SS Spider options:
|
||||
.IP \-bN
|
||||
accept cookies in cookies.txt (0=do not accept,* 1=accept) (\-\-cookies[=N])
|
||||
.IP \-%K
|
||||
load extra cookies from a Netscape cookies.txt (\-\-cookies\-file <param>)
|
||||
.IP \-u
|
||||
check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always) (\-\-check\-type[=N])
|
||||
.IP \-j
|
||||
|
||||
@@ -112,10 +112,6 @@ const char *hts_optalias[][4] = {
|
||||
{"include-query-string", "-%q", "single", ""},
|
||||
{"strip-query", "-%g", "param1",
|
||||
"strip [host/pattern=]key1,key2,... from URLs"},
|
||||
{"cookies-file", "-%K", "param1",
|
||||
"load extra cookies from a Netscape cookies.txt"},
|
||||
{"pause", "-%G", "param1",
|
||||
"random pause of MIN[:MAX] seconds between files"},
|
||||
{"generate-errors", "-o", "single", ""},
|
||||
{"do-not-generate-errors", "-o0", "single", ""},
|
||||
{"purge-old", "-X", "param", ""},
|
||||
|
||||
@@ -129,8 +129,6 @@ typedef enum HTTPStatusCode {
|
||||
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
|
||||
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
|
||||
HTTP_EXPECTATION_FAILED = 417,
|
||||
HTTP_TOO_MANY_REQUESTS = 429,
|
||||
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
|
||||
HTTP_INTERNAL_SERVER_ERROR = 500,
|
||||
HTTP_NOT_IMPLEMENTED = 501,
|
||||
HTTP_BAD_GATEWAY = 502,
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
|
||||
# Change this to download files
|
||||
if false; then
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
||||
fi
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// catch_url_init(&port,&return_host);
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
|
||||
T_SOC soc;
|
||||
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
|
||||
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
|
||||
int i = 0;
|
||||
|
||||
do {
|
||||
|
||||
193
src/htscore.c
193
src/htscore.c
@@ -35,7 +35,6 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <ctype.h>
|
||||
#include <stdint.h> /* uint64_t for the pause mixer (already a hard dep via md5.h) */
|
||||
|
||||
/* File defs */
|
||||
#include "htscore.h"
|
||||
@@ -406,40 +405,29 @@ void hts_invalidate_link(httrackp * opt, int lpos) {
|
||||
opt->liens[lpos]->pass2 = -1;
|
||||
}
|
||||
|
||||
// Write the makeindex footer (refresh meta when makeindex_links==1), close
|
||||
// the file, then run usercommand.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil) {
|
||||
if (!*makeindex_done) {
|
||||
if (*makeindex_fp) {
|
||||
char BIGSTK tempo[1024];
|
||||
if (makeindex_links == 1) {
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE * 2];
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped));
|
||||
snprintf(tempo, sizeof(tempo),
|
||||
"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">" CRLF,
|
||||
link_escaped);
|
||||
} else
|
||||
tempo[0] = '\0';
|
||||
hts_template_format(*makeindex_fp, template_footer,
|
||||
"<!-- Mirror and index made by HTTrack Website "
|
||||
"Copier/" HTTRACK_VERSION " " HTTRACK_AFF_AUTHORS
|
||||
" -->",
|
||||
tempo, /* EOF */ NULL);
|
||||
fflush(*makeindex_fp);
|
||||
fclose(*makeindex_fp);
|
||||
*makeindex_fp = NULL;
|
||||
usercommand(opt, 0, NULL,
|
||||
fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_html_utf8), "index.html"),
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
*makeindex_done = 1;
|
||||
}
|
||||
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
|
||||
/* does it look like XML ? (SVG et al.) */
|
||||
static int look_like_xml(const char *s) {
|
||||
@@ -535,12 +523,9 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
opt->cookie = &cookie;
|
||||
cookie.max_len = 30000; // max len
|
||||
strcpybuff(cookie.data, "");
|
||||
// Load the mirror's cookies.txt, then the one in the current directory
|
||||
// Charger cookies.txt par défaut ou cookies.txt du miroir
|
||||
cookie_load(opt->cookie, StringBuff(opt->path_log), "cookies.txt");
|
||||
cookie_load(opt->cookie, "", "cookies.txt");
|
||||
// A user-supplied cookie file is merged last so it wins on conflicts
|
||||
if (strnotempty(StringBuff(opt->cookies_file)))
|
||||
cookie_load(opt->cookie, "", StringBuff(opt->cookies_file));
|
||||
} else
|
||||
opt->cookie = NULL;
|
||||
|
||||
@@ -1807,18 +1792,90 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
if (strnotempty(savename()) == 0) { // pas de chemin de sauvegarde
|
||||
if (strcmp(urlfil(), "/robots.txt") == 0) { // robots.txt
|
||||
if (r.adr) {
|
||||
int bptr = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK buff[8192];
|
||||
char BIGSTK infobuff[8192];
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
hts_boolean keep_root = (opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
#else
|
||||
hts_boolean keep_root = HTS_TRUE;
|
||||
#endif
|
||||
int record = 0;
|
||||
|
||||
robots_parse(&robots, urladr(), r.adr, r.size, infobuff,
|
||||
sizeof(infobuff), keep_root);
|
||||
if (strnotempty(infobuff)) {
|
||||
line[0] = '\0';
|
||||
buff[0] = '\0';
|
||||
infobuff[0] = '\0';
|
||||
//
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", r.adr);
|
||||
#endif
|
||||
do {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(r.adr + bptr, line, sizeof(line) - 2);
|
||||
/* strip comment */
|
||||
comm = strchr(line, '#');
|
||||
if (comm != NULL) {
|
||||
*comm = '\0';
|
||||
}
|
||||
/* strip spaces */
|
||||
llen = (int) strlen(line);
|
||||
while(llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a;
|
||||
|
||||
a = line + 11;
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // c pour nous
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack")
|
||||
|| strfield(a, "webhttrack")) {
|
||||
buff[0] = '\0'; // re-enregistrer
|
||||
infobuff[0] = '\0';
|
||||
record = 2; // locked
|
||||
#if DEBUG_ROBOTS
|
||||
printf("explicit disallow for httrack\n");
|
||||
#endif
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
if (strfield(line, "disallow:")) {
|
||||
char *a = line + 9;
|
||||
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (strnotempty(a)) {
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
if (strcmp(a, "/") != 0 ||
|
||||
opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
#endif
|
||||
{ /* ignoring disallow: / */
|
||||
if ((strlen(buff) + strlen(a) + 8) < sizeof(buff)) {
|
||||
strcatbuff(buff, a);
|
||||
strcatbuff(buff, "\n");
|
||||
if ((strlen(infobuff) + strlen(a) + 8) <
|
||||
sizeof(infobuff)) {
|
||||
if (strnotempty(infobuff))
|
||||
strcatbuff(infobuff, ", ");
|
||||
strcatbuff(infobuff, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
else {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"Note: %s robots.txt rules are too restrictive, ignoring /",
|
||||
urladr());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
} while((bptr < r.size) && (strlen(buff) < (sizeof(buff) - 32)));
|
||||
if (strnotempty(buff)) {
|
||||
checkrobots_set(&robots, urladr(), buff);
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"Note: robots.txt forbidden links for %s are: %s",
|
||||
urladr(), infobuff);
|
||||
@@ -2055,8 +2112,7 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/*
|
||||
Ensure the index is being closed
|
||||
*/
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp, makeindex_links,
|
||||
makeindex_firstlink, template_footer, "", "");
|
||||
HT_INDEX_END;
|
||||
|
||||
/*
|
||||
updating-a-remotely-deleted-website hack
|
||||
@@ -3255,21 +3311,6 @@ HTS_INLINE int back_fillmax(struct_back * sback, httrackp * opt,
|
||||
return -1; /* plus de place */
|
||||
}
|
||||
|
||||
/* Seed-derived: stable within a gap, rerolls per launch; a per-call rand()
|
||||
would bias the delay toward min_ms (see header). Jitter, not crypto. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms) {
|
||||
uint64_t z = (uint64_t) seed;
|
||||
|
||||
if (max_ms <= min_ms)
|
||||
return min_ms;
|
||||
/* SplitMix64 finalizer: scrambles the low-entropy ms timestamp. */
|
||||
z += 0x9E3779B97F4A7C15ULL;
|
||||
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
|
||||
z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
|
||||
z ^= z >> 31;
|
||||
return min_ms + (int) (z % (uint64_t) (max_ms - min_ms + 1));
|
||||
}
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
int n = opt->maxsoc - back_nsoc(sback);
|
||||
|
||||
@@ -3290,18 +3331,6 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
|
||||
}
|
||||
}
|
||||
|
||||
// #185 randomized inter-file pause: non-blocking, one launch per gap
|
||||
if (n > 0 && opt->pause_max_ms > 0 && HTS_STAT.last_connect > 0) {
|
||||
TStamp opTime =
|
||||
HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect;
|
||||
TStamp lap = mtime_local() - opTime;
|
||||
|
||||
if (lap < hts_pause_target_ms(opTime, opt->pause_min_ms, opt->pause_max_ms))
|
||||
n = 0;
|
||||
else
|
||||
n = 1;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
@@ -3713,14 +3742,6 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (StringNotEmpty(from->strip_query))
|
||||
StringCopyS(to->strip_query, from->strip_query);
|
||||
|
||||
if (StringNotEmpty(from->cookies_file))
|
||||
StringCopyS(to->cookies_file, from->cookies_file);
|
||||
|
||||
if (from->pause_max_ms > 0) {
|
||||
to->pause_min_ms = from->pause_min_ms;
|
||||
to->pause_max_ms = from->pause_max_ms;
|
||||
}
|
||||
|
||||
if (from->retry > -1)
|
||||
to->retry = from->retry;
|
||||
|
||||
|
||||
@@ -362,14 +362,6 @@ void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
// Finish the makeindex index.html (footer + refresh meta), run usercommand.
|
||||
// Updates *makeindex_done/*makeindex_fp in place; adr/fil are the mode strings.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
@@ -426,10 +418,6 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
|
||||
|
||||
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
|
||||
timestamp seed so it is stable within one gap and rerolls per launch. */
|
||||
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
|
||||
|
||||
/* Schedule more links from the heap into free slots. Returns the number queued,
|
||||
or <=0 if none could be added (no free slot / paused / stopped). */
|
||||
int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
|
||||
@@ -1976,51 +1976,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
StringCat(opt->strip_query, argv[na]);
|
||||
}
|
||||
break;
|
||||
case 'K': // cookies-file: extra Netscape cookies.txt to preload
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
HTS_PANIC_PRINTF(
|
||||
"Option cookies-file needs a blank space and "
|
||||
"a cookies.txt path");
|
||||
printf("Example: --cookies-file \"/home/me/cookies.txt\"\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
na++;
|
||||
if (strlen(argv[na]) >= 1024) {
|
||||
HTS_PANIC_PRINTF("Cookie file path too long");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
}
|
||||
StringCopy(opt->cookies_file, argv[na]);
|
||||
}
|
||||
break;
|
||||
case 'G': // pause: randomized inter-file delay MIN[:MAX] seconds
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
HTS_PANIC_PRINTF("Option pause needs a blank space and a "
|
||||
"delay in seconds (MIN[:MAX])");
|
||||
printf("Example: --pause 5:10\n");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
} else {
|
||||
double pmin = 0, pmax = 0;
|
||||
int nf;
|
||||
|
||||
na++;
|
||||
nf = sscanf(argv[na], "%lf:%lf", &pmin, &pmax);
|
||||
if (nf < 2)
|
||||
pmax = pmin; /* a single value means a fixed delay */
|
||||
/* positive-form bounds: NaN fails every comparison, so this
|
||||
rejects it before the undefined (int)(NaN*1000) cast */
|
||||
if (nf < 1 || !(pmin >= 0 && pmax >= pmin && pmax <= 86400)) {
|
||||
HTS_PANIC_PRINTF("Invalid --pause range (expected "
|
||||
"MIN[:MAX] seconds, 0<=MIN<=MAX<=86400)");
|
||||
htsmain_free();
|
||||
return -1;
|
||||
}
|
||||
opt->pause_min_ms = (int) (pmin * 1000.0);
|
||||
opt->pause_max_ms = (int) (pmax * 1000.0);
|
||||
}
|
||||
break;
|
||||
case 't': /* do not change type (ending) of filenames according to the MIME type */
|
||||
opt->no_type_change = 1;
|
||||
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }
|
||||
|
||||
@@ -30,14 +30,12 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htssafe.h"
|
||||
|
||||
/* static int decode_entity(const uint64_t hash, const size_t len);
|
||||
*/
|
||||
/* static int decode_entity(const unsigned int hash, const size_t len);
|
||||
*/
|
||||
#include "htsentities.h"
|
||||
|
||||
/* hexadecimal conversion */
|
||||
@@ -52,31 +50,30 @@ static int get_hex_value(char c) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* 64-bit FNV-1a; must match htsentities.sh, which keys the entity table on it.
|
||||
*/
|
||||
#define HASH_INIT 0xcbf29ce484222325ULL
|
||||
#define HASH_PRIME 0x100000001b3ULL
|
||||
#define HASH_ADD(HASH, C) \
|
||||
do { \
|
||||
(HASH) ^= (unsigned char) (C); \
|
||||
(HASH) *= HASH_PRIME; \
|
||||
} while (0)
|
||||
/* Numerical Recipes,
|
||||
see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */
|
||||
#define HASH_PRIME ( 1664525 )
|
||||
#define HASH_CONST ( 1013904223 )
|
||||
#define HASH_ADD(HASH, C) do { \
|
||||
(HASH) *= HASH_PRIME; \
|
||||
(HASH) += HASH_CONST; \
|
||||
(HASH) += (C); \
|
||||
} while(0)
|
||||
|
||||
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
|
||||
size_t i, j, ampStart, ampStartDest;
|
||||
int uc;
|
||||
int hex;
|
||||
uint64_t hash;
|
||||
unsigned int hash;
|
||||
|
||||
assertf(max != 0);
|
||||
for (i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0,
|
||||
hash = HASH_INIT;
|
||||
src[i] != '\0'; i++) {
|
||||
for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
|
||||
uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
|
||||
/* start of entity */
|
||||
if (src[i] == '&') {
|
||||
ampStart = i;
|
||||
ampStartDest = j;
|
||||
hash = HASH_INIT;
|
||||
hash = 0;
|
||||
uc = -1;
|
||||
}
|
||||
/* inside a potential entity */
|
||||
@@ -177,11 +174,14 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
/* alphanumerical entity */
|
||||
else {
|
||||
/* alphanum, capped at the longest name
|
||||
* '&CounterClockwiseContourIntegral;' (31) */
|
||||
if (i <= ampStart + 31 && ((src[i] >= '0' && src[i] <= '9') ||
|
||||
(src[i] >= 'A' && src[i] <= 'Z') ||
|
||||
(src[i] >= 'a' && src[i] <= 'z'))) {
|
||||
/* alphanum and not too far ('&thetasym;' is the longest) */
|
||||
if (i <= ampStart + 10 &&
|
||||
(
|
||||
(src[i] >= '0' && src[i] <= '9')
|
||||
|| (src[i] >= 'A' && src[i] <= 'Z')
|
||||
|| (src[i] >= 'a' && src[i] <= 'z')
|
||||
)
|
||||
) {
|
||||
/* compute hash */
|
||||
HASH_ADD(hash, (unsigned char) src[i]);
|
||||
} else {
|
||||
|
||||
13612
src/htsentities.h
13612
src/htsentities.h
File diff suppressed because it is too large
Load Diff
@@ -1,92 +1,75 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Regenerate htsentities.h from the WHATWG named character references.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
src=entities.json
|
||||
url=https://html.spec.whatwg.org/entities.json
|
||||
src=html40.txt
|
||||
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
|
||||
dest=htsentities.h
|
||||
|
||||
# 64-bit FNV-1a of $1, printed as a C constant. Must match the hash in
|
||||
# htsencoding.c. The offset basis is stored as its wrapped (signed) bit pattern;
|
||||
# bash arithmetic is 64-bit two's complement, so the result is bit-exact.
|
||||
fnv1a() {
|
||||
local s=$1 i c h=$((0xcbf29ce484222325))
|
||||
for ((i = 0; i < ${#s}; i++)); do
|
||||
printf -v c '%d' "'${s:i:1}"
|
||||
h=$(((h ^ (c & 0xff)) * 0x100000001b3))
|
||||
done
|
||||
printf '0x%016xULL' "$h"
|
||||
}
|
||||
(
|
||||
cat <<EOF
|
||||
/*
|
||||
-- ${dest} --
|
||||
FILE GENERATED BY $0, DO NOT MODIFY
|
||||
|
||||
if [ ! -f "$src" ]; then
|
||||
curl -fsS "$url" -o "$src"
|
||||
fi
|
||||
We compute the LCG hash
|
||||
(see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
|
||||
for each entity. We should in theory check using strncmp() that we
|
||||
actually have the correct entity, but this is actually statistically
|
||||
not needed.
|
||||
|
||||
# Keep ';'-terminated single-codepoint names; the ~93 multi-codepoint refs can't
|
||||
# fit decode_entity's single-codepoint return and are skipped (left verbatim).
|
||||
pairs=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length == 1)))
|
||||
| sort_by(.key)
|
||||
| .[] | "\(.key | ltrimstr("&") | rtrimstr(";"))\t\(.value.codepoints[0])"' "$src")
|
||||
We may want to do better, but we expect the hash function to be uniform, and
|
||||
let the compiler be smart enough to optimize the switch (for example by
|
||||
checking in log2() intervals)
|
||||
|
||||
This code has been generated using the evil $0 script.
|
||||
*/
|
||||
|
||||
# Skipped multi-codepoint names, kept to prove none aliases an emitted hash.
|
||||
skipped=$(jq -r '
|
||||
to_entries
|
||||
| map(select((.key | endswith(";")) and (.value.codepoints | length > 1)))
|
||||
| .[] | .key | ltrimstr("&") | rtrimstr(";")' "$src")
|
||||
|
||||
cases=""
|
||||
emit_hashes=""
|
||||
while IFS=$'\t' read -r name cp; do
|
||||
hash=$(fnv1a "$name")
|
||||
cases+=" /* $name */"$'\n'
|
||||
cases+=" case $hash:"$'\n'
|
||||
cases+=" if (len == ${#name}) {"$'\n'
|
||||
cases+=" return $cp;"$'\n'
|
||||
cases+=" }"$'\n'
|
||||
cases+=" break;"$'\n'
|
||||
emit_hashes+="$hash"$'\n'
|
||||
done <<<"$pairs"
|
||||
|
||||
skip_hashes=""
|
||||
while IFS= read -r name; do
|
||||
[ -n "$name" ] && skip_hashes+="$(fnv1a "$name")"$'\n'
|
||||
done <<<"$skipped"
|
||||
|
||||
# The switch keys on the hash alone, so the dispatch is correct only while every
|
||||
# emitted name hashes uniquely; prove it here, no runtime name compare needed.
|
||||
dups=$(printf '%s' "$emit_hashes" | sort | uniq -d || true)
|
||||
if [ -n "$dups" ]; then
|
||||
echo "FATAL: two entity names share a hash (duplicate switch case); change the hash:" >&2
|
||||
echo "$dups" >&2
|
||||
exit 1
|
||||
fi
|
||||
# A skipped name colliding with an emitted hash would mis-decode instead of
|
||||
# staying verbatim; forbid that too.
|
||||
aliased=$(comm -12 <(printf '%s' "$emit_hashes" | sort -u) <(printf '%s' "$skip_hashes" | sort -u) || true)
|
||||
if [ -n "$aliased" ]; then
|
||||
echo "FATAL: a skipped multi-codepoint name aliases an emitted hash:" >&2
|
||||
echo "$aliased" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cat >"$dest" <<EOF
|
||||
/* GENERATED by $0 from the WHATWG named character references
|
||||
(${url}). DO NOT EDIT.
|
||||
Dispatch keys on a 64-bit FNV-1a hash of the entity name; the generator
|
||||
aborts on any hash collision, so no runtime name compare is needed. */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
static int decode_entity(const uint64_t hash, const size_t len) {
|
||||
static int decode_entity(const unsigned int hash, const size_t len) {
|
||||
switch(hash) {
|
||||
${cases} }
|
||||
EOF
|
||||
(
|
||||
if test -f ${src}; then
|
||||
cat ${src}
|
||||
else
|
||||
GET "${url}"
|
||||
fi
|
||||
) |
|
||||
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
|
||||
sed \
|
||||
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
||||
-e 's/-->$//' \
|
||||
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
|
||||
(
|
||||
read -r A
|
||||
while test -n "$A"; do
|
||||
ent="${A%% *}"
|
||||
code=$(echo "$A" | cut -f2 -d' ')
|
||||
# compute hash
|
||||
hash=0
|
||||
i=0
|
||||
a=1664525
|
||||
c=1013904223
|
||||
m="$((1 << 32))"
|
||||
while test "$i" -lt ${#ent}; do
|
||||
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
||||
hash="$((((hash * a) % (m) + d + c) % (m)))"
|
||||
i=$((i + 1))
|
||||
done
|
||||
echo -e " /* $A */"
|
||||
echo -e " case ${hash}u:"
|
||||
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
|
||||
echo -e " return ${code};"
|
||||
echo -e " }"
|
||||
echo -e " break;"
|
||||
|
||||
# next
|
||||
read -r A
|
||||
done
|
||||
)
|
||||
cat <<EOF
|
||||
}
|
||||
/* unknown */
|
||||
return -1;
|
||||
}
|
||||
EOF
|
||||
|
||||
echo "wrote $dest ($(grep -c '^ case ' "$dest") entities)" >&2
|
||||
) >${dest}
|
||||
|
||||
@@ -193,12 +193,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
int len = (int) strlen(joker);
|
||||
|
||||
while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
|
||||
// '\' escapes the next char as a literal member, e.g. *[\[\]]
|
||||
if (joker[i] == '\\' && joker[i + 1] != '\0') {
|
||||
i++;
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
} else if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
if ((joker[i] == '<') || (joker[i] == '>')) { // *[<10]
|
||||
int lsize = 0;
|
||||
int lverdict;
|
||||
|
||||
@@ -226,9 +221,7 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
while(isdigit((unsigned char) joker[i]))
|
||||
i++;
|
||||
}
|
||||
} else if (joker[i + 1] == '-' && joker[i + 2] != '\0') {
|
||||
// range *[A-Z]; the '\0' guard rejects a truncated *[a- (else
|
||||
// i+=3 overshoots the NUL)
|
||||
} else if (joker[i + 1] == '-') { // 2 car, ex: *[A-Z]
|
||||
if ((int) (unsigned char) joker[i + 2] >
|
||||
(int) (unsigned char) joker[i]) {
|
||||
int j;
|
||||
@@ -240,7 +233,10 @@ HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * s
|
||||
}
|
||||
// else err=1;
|
||||
i += 3;
|
||||
} else { // 1 car, ex: *[ ]
|
||||
} else { // 1 car, ex: *[ ]
|
||||
if (joker[i + 2] == '\\' && joker[i + 3] != 0) { // escaped char, such as *[\[] or *[\]]
|
||||
i++;
|
||||
}
|
||||
pass[(int) (unsigned char) joker[i]] = 1;
|
||||
i++;
|
||||
}
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-10"
|
||||
#define HTTRACK_VERSIONID "3.49.10"
|
||||
#define HTTRACK_VERSION "3.49-9"
|
||||
#define HTTRACK_VERSIONID "3.49.9"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
@@ -229,10 +229,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEFAULT_FOOTER \
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION \
|
||||
" " HTTRACK_AFF_AUTHORS ", %s -->"
|
||||
/* Honest crawler User-Agent; no fake OS/browser to go stale. */
|
||||
#define HTS_DEFAULT_USER_AGENT \
|
||||
"Mozilla/5.0 (compatible; HTTrack/" HTTRACK_AFF_VERSION \
|
||||
"; +https://www.httrack.com/)"
|
||||
#define HTTRACK_WEB "http://www.httrack.com"
|
||||
#define HTS_UPDATE_WEBSITE \
|
||||
"http://www.httrack.com/" \
|
||||
|
||||
@@ -521,7 +521,6 @@ void help(const char *app, int more) {
|
||||
infomsg(" EN maximum mirror time in seconds (60=1 minute, 3600=1 hour)");
|
||||
infomsg(" AN maximum transfer rate in bytes/seconds (1000=1KB/s max)");
|
||||
infomsg(" %cN maximum number of connections/seconds (*%c10)");
|
||||
infomsg(" %G random pause of MIN[:MAX] seconds between files (e.g. %G5:10)");
|
||||
infomsg
|
||||
(" GN pause transfer if N bytes reached, and wait until lock file is deleted");
|
||||
infomsg("");
|
||||
@@ -573,7 +572,6 @@ void help(const char *app, int more) {
|
||||
infomsg("");
|
||||
infomsg("Spider options:");
|
||||
infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)");
|
||||
infomsg(" %K load extra cookies from a Netscape cookies.txt");
|
||||
infomsg
|
||||
(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)");
|
||||
infomsg
|
||||
|
||||
114
src/htslib.c
114
src/htslib.c
@@ -563,39 +563,6 @@ const char *hts_mime[][2] = {
|
||||
{"", ""}
|
||||
};
|
||||
|
||||
/* Modern web formats (post-2010), kept in their own table: appending to the
|
||||
legacy hts_mime[] above makes clang-format reflow its whole initializer.
|
||||
Scanned after hts_mime[], so it never shadows a legacy mapping. */
|
||||
static const char *hts_mime_modern[][2] = {
|
||||
{"image/webp", "webp"},
|
||||
{"image/avif", "avif"},
|
||||
{"image/heic", "heic"},
|
||||
{"font/woff", "woff"},
|
||||
{"font/woff2", "woff2"},
|
||||
{"font/ttf", "ttf"},
|
||||
{"font/otf", "otf"},
|
||||
{"application/json", "json"},
|
||||
{"application/ld+json", "jsonld"},
|
||||
{"application/manifest+json", "webmanifest"},
|
||||
{"application/wasm", "wasm"},
|
||||
{"text/javascript", "js"},
|
||||
{"text/javascript", "mjs"},
|
||||
{"text/markdown", "md"},
|
||||
{"video/mp4", "mp4"},
|
||||
{"video/webm", "webm"},
|
||||
{"video/ogg", "ogv"},
|
||||
{"video/mp2t", "ts"},
|
||||
{"audio/mp4", "m4a"},
|
||||
{"audio/aac", "aac"},
|
||||
{"audio/ogg", "oga"},
|
||||
{"audio/opus", "opus"},
|
||||
{"audio/flac", "flac"},
|
||||
{"audio/webm", "weba"},
|
||||
{"application/x-7z-compressed", "7z"},
|
||||
{"application/x-rar-compressed", "rar"},
|
||||
{"application/zstd", "zst"},
|
||||
{"", ""}};
|
||||
|
||||
// Reserved (RFC2396)
|
||||
#define CIS(c,ch) ( ((unsigned char)(c)) == (ch) )
|
||||
#define CHAR_RESERVED(c) ( CIS(c,';') \
|
||||
@@ -1326,12 +1293,16 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
|
||||
// Compression accepted ?
|
||||
if (retour->req.http11) {
|
||||
hts_boolean compressible = HTS_FALSE;
|
||||
#if HTS_USEZLIB
|
||||
compressible = (!retour->req.range_used && !retour->req.nocompression);
|
||||
if ((!retour->req.range_used)
|
||||
&& (!retour->req.nocompression))
|
||||
print_buffer(&bstr, "Accept-Encoding: " "gzip" /* gzip if the preffered encoding */
|
||||
", " "identity;q=0.9" H_CRLF);
|
||||
else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
#else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
#endif
|
||||
print_buffer(&bstr, "Accept-Encoding: %s" H_CRLF,
|
||||
hts_acceptencoding(compressible));
|
||||
}
|
||||
|
||||
/* Authentification */
|
||||
@@ -1947,10 +1918,6 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
|
||||
return "Requested Range Not Satisfiable";
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
case 501:
|
||||
@@ -4341,20 +4308,6 @@ void guess_httptype(httrackp * opt, char *s, const char *fil) {
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, 1);
|
||||
}
|
||||
|
||||
// first match in a NUL-terminated {mime,ext} table. key selects the lookup
|
||||
// column (0=mime, 1=ext); returns the other column, or NULL if no row matches
|
||||
// (a "*" partner means the row carries no value).
|
||||
static const char *hts_mime_lookup(const char *(*table)[2], int key,
|
||||
const char *needle) {
|
||||
int j;
|
||||
|
||||
for (j = 0; strnotempty(table[j][1]); j++) {
|
||||
if (strfield2(table[j][key], needle) && table[j][!key][0] != '*')
|
||||
return table[j][!key];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// write the mime type for fil into s (capacity ssize)
|
||||
// flag: 1 to always return a type (the "application/..." / octet-stream
|
||||
// fallback) returns 1 if a type was written to s, 0 otherwise
|
||||
@@ -4378,15 +4331,17 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
const char *mime;
|
||||
int j = 0;
|
||||
|
||||
a++;
|
||||
mime = hts_mime_lookup(hts_mime, 1, a);
|
||||
if (mime == NULL)
|
||||
mime = hts_mime_lookup(hts_mime_modern, 1, a);
|
||||
if (mime != NULL) {
|
||||
strlcpybuff(s, mime, ssize);
|
||||
return 1;
|
||||
while(strnotempty(hts_mime[j][1])) {
|
||||
if (strfield2(hts_mime[j][1], a)) {
|
||||
if (hts_mime[j][0][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][0], ssize);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
}
|
||||
|
||||
if (flag) {
|
||||
@@ -4410,11 +4365,6 @@ HTSEXT_API void get_httptype(httrackp *opt, char *s, const char *fil,
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, flag);
|
||||
}
|
||||
|
||||
/* Advertised Accept-Encoding; gzip and deflate both decode via hts_zunpack */
|
||||
const char *hts_acceptencoding(hts_boolean compressible) {
|
||||
return compressible ? "gzip, deflate, identity;q=0.9" : "identity";
|
||||
}
|
||||
|
||||
// get type of fil (php)
|
||||
// s: buffer (text/html) or NULL
|
||||
// return: 1 if known by user
|
||||
@@ -4526,16 +4476,18 @@ int get_userhttptype(httrackp * opt, char *s, const char *fil) {
|
||||
// returns 1 if an extension was found (and written to s), 0 otherwise
|
||||
int give_mimext(char *s, size_t ssize, const char *st) {
|
||||
int ok = 0;
|
||||
const char *ext;
|
||||
int j = 0;
|
||||
|
||||
st = hts_effective_mime(st); /* no declared type: derive an html ext */
|
||||
s[0] = '\0';
|
||||
ext = hts_mime_lookup(hts_mime, 0, st);
|
||||
if (ext == NULL)
|
||||
ext = hts_mime_lookup(hts_mime_modern, 0, st);
|
||||
if (ext != NULL) {
|
||||
strlcpybuff(s, ext, ssize);
|
||||
ok = 1;
|
||||
while((!ok) && (strnotempty(hts_mime[j][1]))) {
|
||||
if (strfield2(hts_mime[j][0], st)) {
|
||||
if (hts_mime[j][1][0] != '*') { // a match exists
|
||||
strlcpybuff(s, hts_mime[j][1], ssize);
|
||||
ok = 1;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
}
|
||||
// wrap "x" mimetypes, such as:
|
||||
// application/x-mp3
|
||||
@@ -5802,13 +5754,6 @@ HTSEXT_API int hts_init(void) {
|
||||
abortLog("unable to initialize TLS: SSL_CTX_new()");
|
||||
assertf("unable to initialize TLS" == NULL);
|
||||
}
|
||||
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
|
||||
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
|
||||
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
|
||||
#else
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
|
||||
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -6060,7 +6005,8 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->shell = HTS_FALSE;
|
||||
opt->proxy.active = 0; // pas de proxy
|
||||
opt->user_agent_send = HTS_TRUE;
|
||||
StringCopy(opt->user_agent, HTS_DEFAULT_USER_AGENT);
|
||||
StringCopy(opt->user_agent,
|
||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||
StringCopy(opt->referer, "");
|
||||
StringCopy(opt->from, "");
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||
@@ -6099,9 +6045,6 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->no_query_dedup = HTS_FALSE;
|
||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||
StringCopy(opt->strip_query, "");
|
||||
StringCopy(opt->cookies_file, "");
|
||||
opt->pause_min_ms = 0;
|
||||
opt->pause_max_ms = 0;
|
||||
opt->ftp_proxy = HTS_TRUE;
|
||||
opt->convert_utf8 = HTS_TRUE;
|
||||
StringCopy(opt->filelist, "");
|
||||
@@ -6247,7 +6190,6 @@ HTSEXT_API void hts_free_opt(httrackp * opt) {
|
||||
StringFree(opt->footer);
|
||||
StringFree(opt->mod_blacklist);
|
||||
StringFree(opt->strip_query);
|
||||
StringFree(opt->cookies_file);
|
||||
|
||||
StringFree(opt->path_html);
|
||||
StringFree(opt->path_html_utf8);
|
||||
|
||||
@@ -285,9 +285,6 @@ int ishttperror(int err);
|
||||
int get_userhttptype(httrackp * opt, char *s, const char *fil);
|
||||
int give_mimext(char *s, size_t ssize, const char *st);
|
||||
|
||||
/* Advertised Accept-Encoding value (no header name/CRLF); see htslib.c. */
|
||||
const char *hts_acceptencoding(hts_boolean compressible);
|
||||
|
||||
int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename);
|
||||
int may_unknown2(httrackp * opt, const char *mime, const char *filename);
|
||||
|
||||
|
||||
@@ -535,10 +535,6 @@ struct httrackp {
|
||||
no_www_dedup; /**< with urlhack, keep www.host distinct from host */
|
||||
hts_boolean no_slash_dedup; /**< with urlhack, keep redundant // in paths */
|
||||
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
|
||||
String cookies_file; /**< extra Netscape cookies.txt to preload
|
||||
(--cookies-file) */
|
||||
int pause_min_ms; /**< inter-file pause lower bound, ms (0=off, #185) */
|
||||
int pause_max_ms; /**< inter-file pause upper bound, ms */
|
||||
};
|
||||
|
||||
/* Running statistics for a mirror. */
|
||||
|
||||
@@ -167,6 +167,30 @@ Please visit our Website: http://www.httrack.com
|
||||
}
|
||||
#define HT_ADD_FOP
|
||||
|
||||
// COPY IN HTSCORE.C
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
@@ -278,14 +302,6 @@ static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Drop a redirect Location's #fragment: a UA anchor, never part of the fetched
|
||||
* resource (#204). */
|
||||
static void url_drop_fragment(char *const url) {
|
||||
char *const frag = strchr(url, '#');
|
||||
if (frag != NULL)
|
||||
*frag = '\0';
|
||||
}
|
||||
|
||||
/* True if [s, s+len) is exactly an HTTP method token (XHR.open's first
|
||||
argument is a method, not a URL: #218). Case-insensitive. */
|
||||
static int is_http_method(const char *s, size_t len) {
|
||||
@@ -685,9 +701,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp,
|
||||
makeindex_links, makeindex_firstlink,
|
||||
template_footer, "primary", "primary");
|
||||
HT_INDEX_END;
|
||||
}
|
||||
} // if (opt->makeindex)
|
||||
}
|
||||
@@ -3582,7 +3596,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
//
|
||||
|
||||
strcpybuff(mov_url, r->location);
|
||||
url_drop_fragment(mov_url);
|
||||
|
||||
// url qque -> adresse+fichier
|
||||
if ((reponse =
|
||||
@@ -4790,7 +4803,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
mov_url[0] = '\0';
|
||||
strcpybuff(mov_url, back[b].r.location); // copier URL
|
||||
url_drop_fragment(mov_url);
|
||||
|
||||
/* Remove (temporarily created) file if it was created */
|
||||
UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), back[b].url_sav));
|
||||
|
||||
167
src/htsrobots.c
167
src/htsrobots.c
@@ -44,84 +44,28 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
// -- robots --
|
||||
|
||||
/* RFC 9309 path-prefix match; '*' any run, '$' anchors end; linear. */
|
||||
static hts_boolean robots_pattern_match(const char *pattern, const char *path) {
|
||||
size_t patlen = strlen(pattern);
|
||||
hts_boolean anchored = HTS_FALSE;
|
||||
const char *p, *pend, *s;
|
||||
const char *star = NULL, *star_s = NULL;
|
||||
|
||||
if (patlen > 0 && pattern[patlen - 1] == '$') {
|
||||
anchored = HTS_TRUE;
|
||||
patlen--;
|
||||
}
|
||||
p = pattern;
|
||||
pend = pattern + patlen;
|
||||
s = path;
|
||||
while (*s != '\0') {
|
||||
if (p == pend) {
|
||||
if (!anchored)
|
||||
return HTS_TRUE; // prefix matched
|
||||
if (star != NULL) { // anchored: '*' must eat the rest
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
continue;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
if (*p == '*') {
|
||||
star = p++;
|
||||
star_s = s;
|
||||
} else if (*p == *s) {
|
||||
p++;
|
||||
s++;
|
||||
} else if (star != NULL) {
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
} else {
|
||||
return HTS_FALSE;
|
||||
}
|
||||
}
|
||||
while (p < pend && *p == '*')
|
||||
p++;
|
||||
return (p == pend) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
// fil="" : vérifier si règle déja enregistrée
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
while(robots) {
|
||||
if (strfield2(robots->adr, adr)) {
|
||||
if (fil[0]) {
|
||||
/* RFC 9309: longest pattern wins, Allow beats Disallow on ties. */
|
||||
int ptr = 0;
|
||||
char line[HTS_ROBOTS_TOKEN_SIZE];
|
||||
size_t toklen = strlen(robots->token);
|
||||
size_t best_len = 0;
|
||||
hts_boolean matched = HTS_FALSE;
|
||||
hts_boolean best_allow = HTS_FALSE;
|
||||
char line[250];
|
||||
|
||||
while (ptr < (int) toklen) {
|
||||
ptr += binput(robots->token + ptr, line, sizeof(line) - 1);
|
||||
if (line[0] != 'A' && line[0] != 'D')
|
||||
continue;
|
||||
{
|
||||
const hts_boolean is_allow =
|
||||
(line[0] == 'A') ? HTS_TRUE : HTS_FALSE;
|
||||
const char *pat = line + 1;
|
||||
|
||||
if (robots_pattern_match(pat, fil)) {
|
||||
const size_t len = strlen(pat);
|
||||
|
||||
if (!matched || len > best_len || (len == best_len && is_allow)) {
|
||||
matched = HTS_TRUE;
|
||||
best_len = len;
|
||||
best_allow = is_allow;
|
||||
if (strnotempty(robots->token)) {
|
||||
do {
|
||||
ptr += binput(robots->token + ptr, line, 200);
|
||||
if (line[0] == '/') { // absolu
|
||||
if (strfield(fil, line)) { // commence avec ligne
|
||||
return -1; // interdit
|
||||
}
|
||||
} else { // relatif
|
||||
if (strstrcase(fil, line)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while((strnotempty(line)) && (ptr < (int) strlen(robots->token)));
|
||||
}
|
||||
if (matched && !best_allow)
|
||||
return -1; // forbidden
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@@ -130,93 +74,6 @@ int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Append "<marker><pattern>\n" to the bounded rule blob if it fits. */
|
||||
static void robots_blob_add(char *blob, size_t blobsize, char marker,
|
||||
const char *pat) {
|
||||
const size_t used = strlen(blob);
|
||||
const size_t need = strlen(pat) + 2; // marker + '\n'
|
||||
|
||||
if (need < blobsize - used) { // overflow-safe: used <= blobsize-1
|
||||
blob[used] = marker;
|
||||
blob[used + 1] = '\0';
|
||||
strlcatbuff(blob, pat, blobsize);
|
||||
strlcatbuff(blob, "\n", blobsize);
|
||||
}
|
||||
}
|
||||
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow) {
|
||||
size_t bptr = 0;
|
||||
int record = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK blob[HTS_ROBOTS_TOKEN_SIZE];
|
||||
|
||||
blob[0] = '\0';
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", body);
|
||||
#endif
|
||||
while (bptr < bodysize) {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(body + bptr, line, sizeof(line) - 2);
|
||||
comm = strchr(line, '#'); // strip comment
|
||||
if (comm != NULL)
|
||||
*comm = '\0';
|
||||
llen = (int) strlen(line); // strip trailing spaces
|
||||
while (llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a = line + 11;
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // generic group applies to us
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack") ||
|
||||
strfield(a, "webhttrack")) {
|
||||
blob[0] = '\0'; // explicit group: restart capture
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
record = 2; // locked to the httrack group
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
hts_boolean is_allow = strfield(line, "allow:");
|
||||
hts_boolean is_disallow = !is_allow && strfield(line, "disallow:");
|
||||
|
||||
if (is_allow || is_disallow) {
|
||||
char *a = line + (is_allow ? 6 : 9);
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (strnotempty(a)) {
|
||||
if (is_disallow && !keep_root_disallow && strcmp(a, "/") == 0) {
|
||||
// dropped: site-wide disallow ignored by option
|
||||
} else {
|
||||
robots_blob_add(blob, sizeof(blob), is_allow ? 'A' : 'D', a);
|
||||
if (is_disallow && info != NULL &&
|
||||
strlen(a) + 2 < infosize - strlen(info)) {
|
||||
if (strnotempty(info))
|
||||
strlcatbuff(info, ", ", infosize);
|
||||
strlcatbuff(info, a, infosize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (strnotempty(blob))
|
||||
checkrobots_set(robots, adr, blob);
|
||||
}
|
||||
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data) {
|
||||
if (((int) strlen(adr)) >= sizeof(robots->adr) - 2)
|
||||
return 0;
|
||||
|
||||
@@ -39,27 +39,17 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEF_FWSTRUCT_robots_wizard
|
||||
typedef struct robots_wizard robots_wizard;
|
||||
#endif
|
||||
|
||||
/* Per-host blob: one rule per line, first byte 'A'/'D' then path pattern. */
|
||||
#define HTS_ROBOTS_TOKEN_SIZE 4096
|
||||
|
||||
struct robots_wizard {
|
||||
char adr[128];
|
||||
char token[HTS_ROBOTS_TOKEN_SIZE];
|
||||
char token[4096];
|
||||
struct robots_wizard *next;
|
||||
};
|
||||
|
||||
/* Library internal definictions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
/* -1 if `fil` disallowed for `adr` (RFC 9309); empty: -1 if rules exist. */
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil);
|
||||
void checkrobots_free(robots_wizard * robots);
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data);
|
||||
/* Parse robots.txt `body` for `adr`, storing the HTTrack group's rules; `info`
|
||||
gets a disallow summary, `keep_root_disallow` FALSE drops "Disallow: /". */
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -50,9 +50,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htsmd5.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
#include "coucal/coucal.h"
|
||||
|
||||
#include <ctype.h>
|
||||
@@ -242,14 +239,6 @@ static void basic_selftests(void) {
|
||||
assertf(strcmp(ext, "html") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "no/such-mime-type") == 0);
|
||||
assertf(ext[0] == '\0');
|
||||
// modern web formats -> extension. Avoid MIME types the
|
||||
// application/<=4-char-subtype fallback could fabricate without a row.
|
||||
assertf(give_mimext(ext, sizeof(ext), "image/webp") == 1);
|
||||
assertf(strcmp(ext, "webp") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "application/manifest+json") == 1);
|
||||
assertf(strcmp(ext, "webmanifest") == 0);
|
||||
assertf(give_mimext(ext, sizeof(ext), "font/woff2") == 1);
|
||||
assertf(strcmp(ext, "woff2") == 0);
|
||||
}
|
||||
// convtolower(): lower-cases into the caller buffer (bounded by its size).
|
||||
{
|
||||
@@ -304,16 +293,6 @@ static void basic_selftests(void) {
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"x.gif", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "image/gif") == 0);
|
||||
// modern extensions map back to their MIME type
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"x.webp", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "image/webp") == 0);
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"app.wasm", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/wasm") == 0);
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"mod.mjs", 0) == 1);
|
||||
assertf(strcmp(r.contenttype, "text/javascript") == 0);
|
||||
// no extension and flag==0: nothing written, returns 0
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 0) == 0);
|
||||
@@ -533,21 +512,15 @@ static int string_safety_selftests(void) {
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
static int st_filter(httrackp *opt, int argc, char **argv) {
|
||||
char *str, *pat;
|
||||
int matched;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "filter: needs a filter pattern and a string\n");
|
||||
return 1;
|
||||
}
|
||||
/* exact-size heap copies so a sanitizer traps any over-read of the pattern */
|
||||
str = strdupt(argv[1]);
|
||||
pat = strdupt(argv[0]);
|
||||
matched = strjoker(str, pat, NULL, NULL) != NULL;
|
||||
printf("%s does %s %s\n", argv[1], matched ? "match" : "NOT match", argv[0]);
|
||||
freet(str);
|
||||
freet(pat);
|
||||
if (strjoker(argv[1], argv[0], NULL, NULL))
|
||||
printf("%s does match %s\n", argv[1], argv[0]);
|
||||
else
|
||||
printf("%s does NOT match %s\n", argv[1], argv[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -926,71 +899,12 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
|
||||
if (to->parseall != HTS_TRUE)
|
||||
err = 1;
|
||||
|
||||
/* String field: a non-empty source deep-copies across, an empty source
|
||||
leaves the target intact (StringNotEmpty guard). Covers the exported
|
||||
copy_htsopt String path that no crawl test reaches. */
|
||||
StringCopy(from->cookies_file, "/tmp/jar.txt");
|
||||
StringCopy(to->cookies_file, "");
|
||||
copy_htsopt(from, to);
|
||||
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
|
||||
err = 1;
|
||||
StringCopy(from->cookies_file, "");
|
||||
copy_htsopt(from, to);
|
||||
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
|
||||
err = 1;
|
||||
|
||||
/* #185 pause pair: copied when enabled (max>0), the 0 sentinel skips */
|
||||
from->pause_min_ms = 5000;
|
||||
from->pause_max_ms = 10000;
|
||||
to->pause_min_ms = to->pause_max_ms = 0;
|
||||
copy_htsopt(from, to);
|
||||
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
|
||||
err = 1;
|
||||
from->pause_min_ms = from->pause_max_ms = 0;
|
||||
copy_htsopt(from, to);
|
||||
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
|
||||
err = 1;
|
||||
|
||||
hts_free_opt(from);
|
||||
hts_free_opt(to);
|
||||
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_pause(httrackp *opt, int argc, char **argv) {
|
||||
int err = 0, i, seen_low = 0, seen_high = 0;
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
/* Consecutive-ms seeds (production shape: launch timestamps a few ms apart)
|
||||
must stay in range and spread, not collapse to a bound -- worst case for a
|
||||
weak low-bit mixer. */
|
||||
for (i = 0; i < 10000; i++) {
|
||||
int t = hts_pause_target_ms((TStamp) (1719500000000LL + i), 5000, 10000);
|
||||
|
||||
if (t < 5000 || t > 10000)
|
||||
err = 1;
|
||||
seen_low |= (t < 6000);
|
||||
seen_high |= (t > 9000);
|
||||
}
|
||||
if (!seen_low || !seen_high)
|
||||
err = 1;
|
||||
if (hts_pause_target_ms(12345, 8000, 8000) != 8000) /* equal bounds = fixed */
|
||||
err = 1;
|
||||
/* deterministic: a seed yields the same target even after an intervening call
|
||||
with another seed (no global PRNG state to perturb it) */
|
||||
{
|
||||
int a = hts_pause_target_ms(99, 5000, 10000);
|
||||
|
||||
(void) hts_pause_target_ms(54321, 5000, 10000);
|
||||
if (hts_pause_target_ms(99, 5000, 10000) != a)
|
||||
err = 1;
|
||||
}
|
||||
printf("pause: %s\n", err ? "FAIL" : "OK");
|
||||
return err;
|
||||
}
|
||||
|
||||
static int st_relative(httrackp *opt, int argc, char **argv) {
|
||||
char s[HTS_URLMAXSIZE * 2];
|
||||
|
||||
@@ -1305,324 +1219,6 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
char path[HTS_URLMAXSIZE];
|
||||
char buf[4096];
|
||||
FILE *fp;
|
||||
size_t n;
|
||||
int done;
|
||||
|
||||
assertf(argc >= 1);
|
||||
snprintf(path, sizeof(path), "%s/index.html", argv[0]);
|
||||
|
||||
/* single first link: footer + a refresh meta carrying the escaped URL */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 1, "http://example.com/a b", "%s%s", "",
|
||||
"");
|
||||
assertf(fp == NULL); /* the function closed and cleared it */
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") != NULL);
|
||||
assertf(strstr(buf, "example.com") != NULL);
|
||||
|
||||
/* no single link: footer only, no refresh meta */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 0, NULL, "%s%s", "", "");
|
||||
assertf(fp == NULL);
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") == NULL);
|
||||
|
||||
UNLINK(path);
|
||||
printf("makeindex self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(ua != NULL);
|
||||
assertf(strcmp(ua, HTS_DEFAULT_USER_AGENT) == 0);
|
||||
/* Teeth independent of the macro: honest token + self-identifier, and no
|
||||
legacy Mozilla/4.x fake-browser string (rejects the whole relic family). */
|
||||
assertf(strstr(ua, "HTTrack/") != NULL);
|
||||
assertf(strstr(ua, "+https://www.httrack.com/") != NULL);
|
||||
assertf(strstr(ua, "Mozilla/4.") == NULL);
|
||||
printf("useragent self-test OK: %s\n", ua);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* HTTP status code -> reason phrase, including the modern 429/451. */
|
||||
static int st_status(httrackp *opt, int argc, char **argv) {
|
||||
const char *s;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
s = infostatuscode_const(429);
|
||||
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
|
||||
s = infostatuscode_const(451);
|
||||
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
|
||||
/* A spot-check of a long-standing code, and an unknown one. */
|
||||
s = infostatuscode_const(404);
|
||||
assertf(s != NULL && strcmp(s, "Not Found") == 0);
|
||||
assertf(infostatuscode_const(799) == NULL);
|
||||
printf("status self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if HTS_USEZLIB
|
||||
/* Deflate src->path at windowBits (16+ gzip, + zlib, - raw); 0 on success. */
|
||||
static int ae_write_packed(const char *path, int windowBits,
|
||||
const unsigned char *src, size_t len) {
|
||||
unsigned char out[8192];
|
||||
z_stream strm;
|
||||
FILE *f;
|
||||
int zerr;
|
||||
|
||||
memset(&strm, 0, sizeof(strm));
|
||||
if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowBits, 8,
|
||||
Z_DEFAULT_STRATEGY) != Z_OK)
|
||||
return 1;
|
||||
if ((f = FOPEN(path, "wb")) == NULL) {
|
||||
deflateEnd(&strm);
|
||||
return 1;
|
||||
}
|
||||
strm.next_in = (Bytef *) src;
|
||||
strm.avail_in = (uInt) len;
|
||||
do {
|
||||
size_t n;
|
||||
|
||||
strm.next_out = out;
|
||||
strm.avail_out = sizeof(out);
|
||||
zerr = deflate(&strm, Z_FINISH);
|
||||
n = sizeof(out) - strm.avail_out;
|
||||
if (n > 0 && fwrite(out, 1, n, f) != n) {
|
||||
deflateEnd(&strm);
|
||||
fclose(f);
|
||||
return 1;
|
||||
}
|
||||
} while (zerr == Z_OK);
|
||||
deflateEnd(&strm);
|
||||
fclose(f);
|
||||
return (zerr == Z_STREAM_END) ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Write src[0..len) to `path` as a forged raw-deflate stream made of two
   stored (BTYPE=00) blocks whose first two bytes are 08 1D, so that a
   header sniffer misdetects it as zlib; only the raw-deflate fallback can
   decode it. Each stored block is: header byte, LEN (little-endian 16 bit),
   its one's complement, then LEN literal bytes.
   Requires 29 <= len <= 29 + 0xFFFF. Returns 0 on success, 1 on failure
   (bad length, allocation, open, write or final flush). */
static int ae_write_collision(const char *path, const unsigned char *src,
                              size_t len) {
  /* block-1 LEN low byte 0x1D: with 0x08, (0x081D)%31==0 */
  const size_t n1 = 29;
  size_t n2, p = 0;
  unsigned char *buf;
  FILE *f;
  int ok;

  /* the second block's length must fit the 16-bit LEN field */
  if (len < n1 || len - n1 > 0xFFFF)
    return 1;
  n2 = len - n1;
  buf = malloct(10 + len); /* two 5-byte block headers + payload */
  if (buf == NULL)
    return 1;

  buf[p++] = 0x08; /* BFINAL=0, BTYPE=00, forged padding -> zlib CMF nibble */
  buf[p++] = (unsigned char) (n1 & 0xff);
  buf[p++] = (unsigned char) (n1 >> 8);
  buf[p++] = (unsigned char) (~n1 & 0xff);
  buf[p++] = (unsigned char) ((~n1 >> 8) & 0xff);
  memcpy(buf + p, src, n1);
  p += n1;

  buf[p++] = 0x01; /* BFINAL=1, BTYPE=00 */
  buf[p++] = (unsigned char) (n2 & 0xff);
  buf[p++] = (unsigned char) (n2 >> 8);
  buf[p++] = (unsigned char) (~n2 & 0xff);
  buf[p++] = (unsigned char) ((~n2 >> 8) & 0xff);
  memcpy(buf + p, src + n1, n2);
  p += n2;

  f = FOPEN(path, "wb");
  ok = (f != NULL && fwrite(buf, 1, p, f) == p);
  /* fclose flushes the stdio buffer: a failure here means a short file */
  if (f != NULL && fclose(f) != 0)
    ok = 0;
  freet(buf);
  return ok ? 0 : 1;
}
|
||||
|
||||
/* Check that the file `path` holds exactly expect[0..len).
   The file is read in fixed-size chunks so large files need no big buffer.
   Returns 0 when the contents match, 1 on open failure, on any differing
   byte, or when the file is shorter or longer than len. */
static int ae_check_decoded(const char *path, const unsigned char *expect,
                            size_t len) {
  unsigned char chunk[8192];
  FILE *in = FOPEN(path, "rb");
  size_t pos = 0;
  int mismatch = 0;

  if (in == NULL)
    return 1;

  for (;;) {
    const size_t got = fread(chunk, 1, sizeof(chunk), in);

    if (got == 0)
      break;
    /* more data than expected, or data that differs */
    if (got > len - pos || memcmp(chunk, expect + pos, got) != 0) {
      mismatch = 1;
      break;
    }
    pos += got;
  }
  fclose(in);

  if (mismatch)
    return 1;
  return pos == len ? 0 : 1;
}
|
||||
#endif
|
||||
|
||||
/* Accept-Encoding (#450): advertise gzip+deflate; both decode (hts_zunpack) */
|
||||
static int st_acceptencoding(httrackp *opt, int argc, char **argv) {
|
||||
const char *off = hts_acceptencoding(HTS_FALSE);
|
||||
const char *on = hts_acceptencoding(HTS_TRUE);
|
||||
|
||||
(void) opt;
|
||||
assertf(strcmp(off, "identity") == 0);
|
||||
assertf(strstr(on, "gzip") != NULL);
|
||||
assertf(strstr(on, "deflate") != NULL); /* fails on the old gzip-only list */
|
||||
#if HTS_USEZLIB
|
||||
if (argc >= 1) {
|
||||
static const int windowBits[] = {16 + MAX_WBITS, MAX_WBITS, -MAX_WBITS};
|
||||
const unsigned char small[] =
|
||||
"deflate round-trip: HTTrack decodes gzip and deflate alike. "
|
||||
"deflate round-trip: HTTrack decodes gzip and deflate alike.";
|
||||
const size_t slen = sizeof(small) - 1;
|
||||
/* 64 KiB of varied (LCG) bytes: forces the multi-fread loop */
|
||||
const size_t blen = 64 * 1024;
|
||||
unsigned char *body = malloct(blen);
|
||||
uint32_t x = 0x1234567u;
|
||||
char inpath[HTS_URLMAXSIZE], outpath[HTS_URLMAXSIZE];
|
||||
size_t i;
|
||||
|
||||
assertf(body != NULL);
|
||||
for (i = 0; i < blen; i++) {
|
||||
x = x * 1103515245u + 12345u;
|
||||
body[i] = (unsigned char) (x >> 16);
|
||||
}
|
||||
/* gzip, zlib (RFC1950) and raw deflate (RFC1951), both small and large. */
|
||||
for (i = 0; i < sizeof(windowBits) / sizeof(windowBits[0]); i++) {
|
||||
snprintf(inpath, sizeof(inpath), "%s/ae-in-%d.z", argv[0], windowBits[i]);
|
||||
snprintf(outpath, sizeof(outpath), "%s/ae-out-%d", argv[0],
|
||||
windowBits[i]);
|
||||
assertf(ae_write_packed(inpath, windowBits[i], small, slen) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == (int) slen);
|
||||
assertf(ae_check_decoded(outpath, small, slen) == 0);
|
||||
assertf(ae_write_packed(inpath, windowBits[i], body, blen) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == (int) blen);
|
||||
assertf(ae_check_decoded(outpath, body, blen) == 0);
|
||||
}
|
||||
/* Fallback teeth: raw deflate misdetected as zlib; -1 without the retry. */
|
||||
snprintf(inpath, sizeof(inpath), "%s/ae-collide.z", argv[0]);
|
||||
snprintf(outpath, sizeof(outpath), "%s/ae-collide.out", argv[0]);
|
||||
assertf(ae_write_collision(inpath, body, 64) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == 64);
|
||||
assertf(ae_check_decoded(outpath, body, 64) == 0);
|
||||
freet(body);
|
||||
}
|
||||
#else
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#endif
|
||||
printf("acceptencoding self-test OK: %s\n", on);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Each call parses `txt` under a fresh host, then checkrobots() for `path`. */
|
||||
static int rb_decide(robots_wizard *r, const char *txt, const char *path) {
|
||||
static int n = 0;
|
||||
char host[64];
|
||||
|
||||
snprintf(host, sizeof(host), "h%d.example", n++);
|
||||
robots_parse(r, host, txt, strlen(txt), NULL, 0, HTS_TRUE);
|
||||
return checkrobots(r, host, path);
|
||||
}
|
||||
|
||||
/* robots.txt precedence self-test. Each case feeds a rule set to rb_decide()
   and expects 0 (allowed) or -1 (denied) for the probed path.
   Returns 0 (failures abort through assertf). All arguments are unused. */
static int st_robots(httrackp *opt, int argc, char **argv) {
  static const char reopen[] = "User-agent: *\nDisallow: /\nAllow: /public/\n";
  static const char tie[] = "User-agent: *\nDisallow: /foo\nAllow: /foo\n";
  static const char star[] = "User-agent: *\nDisallow: /*.php\n";
  static const char anchor[] = "User-agent: *\nDisallow: /a$\n";
  static const char own_group[] = "User-agent: *\nDisallow: /everyone\n"
                                  "User-agent: httrack\nDisallow: /\n";
  static const char no_merge[] = "User-agent: *\nDisallow: /x\n"
                                 "User-agent: httrack\nDisallow: /y\n";
  robots_wizard wizard;

  (void) opt;
  (void) argc;
  (void) argv;
  memset(&wizard, 0, sizeof(wizard));

  /* Longer Allow re-opens subtree under Disallow: / (old matcher couldn't). */
  assertf(rb_decide(&wizard, reopen, "/public/x") == 0); /* allowed */
  assertf(rb_decide(&wizard, reopen, "/private") == -1); /* denied */
  assertf(rb_decide(&wizard, reopen, "/") == -1);        /* denied */

  /* Equal-length match: Allow wins the tie over Disallow. */
  assertf(rb_decide(&wizard, tie, "/foo/bar") == 0);

  /* Longest match wins even when it is not the last rule. */
  assertf(rb_decide(&wizard, "User-agent: *\nDisallow: /a/b\nAllow: /a\n",
                    "/a/b/c") == -1);
  assertf(rb_decide(&wizard, "User-agent: *\nAllow: /a/b\nDisallow: /a\n",
                    "/a/b/c") == 0);

  /* '*' matches any run of characters. */
  assertf(rb_decide(&wizard, star, "/a/b/index.php") == -1);
  assertf(rb_decide(&wizard, star, "/a/b/index.html") == 0);

  /* Trailing '$' anchors the end of the path. */
  assertf(rb_decide(&wizard, anchor, "/a") == -1);
  assertf(rb_decide(&wizard, anchor, "/ab") == 0);
  assertf(rb_decide(&wizard, anchor, "/a/b") == 0);

  /* The httrack-specific group replaces the generic '*' group entirely. */
  assertf(rb_decide(&wizard, own_group, "/anything") == -1);

  /* Replace, not merge: the generic group does not bind the httrack group. */
  assertf(rb_decide(&wizard, no_merge, "/x") == 0);
  assertf(rb_decide(&wizard, no_merge, "/y") == -1);

  /* No rules: everything is allowed. */
  assertf(rb_decide(&wizard, "User-agent: *\nDisallow:\n", "/x") == 0);

  checkrobots_free(&wizard);
  printf("robots self-test OK\n");
  return 0;
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1655,7 +1251,6 @@ static const struct selftest_entry {
|
||||
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
|
||||
st_strsafe},
|
||||
{"copyopt", "", "copy_htsopt option-copy self-test", st_copyopt},
|
||||
{"pause", "", "randomized inter-file pause target self-test", st_pause},
|
||||
{"relative", "<link> <curr-file>", "relative link between two paths",
|
||||
st_relative},
|
||||
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",
|
||||
@@ -1669,14 +1264,6 @@ static const struct selftest_entry {
|
||||
st_cache_writefail},
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"makeindex", "[dir]", "hts_finish_makeindex footer/refresh self-test",
|
||||
st_makeindex},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
{"acceptencoding", "[dir]",
|
||||
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
|
||||
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
|
||||
st_robots},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
@@ -358,12 +358,12 @@ int smallserver(T_SOC soc, char *url, char *method, char *data, char *path) {
|
||||
{NULL, 0}
|
||||
};
|
||||
initStrElt initStr[] = {
|
||||
{"user", HTS_DEFAULT_USER_AGENT},
|
||||
{"footer", "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x "
|
||||
"[XR&CO'2014], %s -->"},
|
||||
{"url2",
|
||||
"+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}};
|
||||
{"user", "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"},
|
||||
{"footer",
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->"},
|
||||
{"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}
|
||||
};
|
||||
int i = 0;
|
||||
|
||||
for(i = 0; initInt[i].name; i++) {
|
||||
|
||||
@@ -80,10 +80,6 @@ htspair_t hts_detect_embed[] = {
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
|
||||
static const htspair_t hts_detect_embed_html5[] = {
|
||||
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
|
||||
|
||||
/* Internal */
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
|
||||
const char *fil, const char *tag,
|
||||
@@ -140,17 +136,6 @@ static int cmp_token(const char *tag, const char *cmp) {
|
||||
&& !isalnum((unsigned char) tag[p]));
|
||||
}
|
||||
|
||||
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
|
||||
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
|
||||
const char *attribute) {
|
||||
int i;
|
||||
for (i = 0; table[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
|
||||
return HTS_TRUE;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
const char *adr, const char *fil, const char *tag,
|
||||
const char *attribute, int *set_prio_to,
|
||||
@@ -178,9 +163,15 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
/* Built-in known tags (<img src=..>, ..) */
|
||||
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
|
||||
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
|
||||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
|
||||
embedded_triggered = 1;
|
||||
int i;
|
||||
|
||||
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, hts_detect_embed[i].tag)
|
||||
&& cmp_token(attribute, hts_detect_embed[i].attr)
|
||||
) {
|
||||
embedded_triggered = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
101
src/htszlib.c
101
src/htszlib.c
@@ -47,89 +47,48 @@ Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/*
|
||||
Unpack file into a new file (gzip, zlib RFC1950 or raw deflate RFC1951).
|
||||
Unpack file into a new file
|
||||
Return value: size of the new file, or -1 if an error occurred
|
||||
*/
|
||||
/* Note: utf-8 */
|
||||
int hts_zunpack(char *filename, char *newfile) {
|
||||
int ret = -1;
|
||||
|
||||
if (filename != NULL && newfile != NULL && filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
if (filename != NULL && newfile != NULL) {
|
||||
if (filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
const int fd = in != NULL ? fileno(in) : -1;
|
||||
const int dup_fd = fd != -1 ? dup(fd) : -1;
|
||||
// Note: we must dup to be able to fclose cleanly.
|
||||
const gzFile gz = dup_fd != -1 ? gzdopen(dup_fd, "rb") : NULL;
|
||||
|
||||
if (in != NULL) {
|
||||
unsigned char BIGSTK inbuf[8192];
|
||||
size_t navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
/* gzip/zlib headers -> +32 windowBits; else raw deflate (RFC1951) */
|
||||
const hts_boolean wrapped =
|
||||
(navail >= 2 &&
|
||||
((inbuf[0] == 0x1f && inbuf[1] == 0x8b) ||
|
||||
((inbuf[0] & 0x0f) == Z_DEFLATED &&
|
||||
(((unsigned) inbuf[0] << 8 | inbuf[1]) % 31) == 0)));
|
||||
int attempt;
|
||||
if (gz) {
|
||||
FILE *const fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
int size = 0;
|
||||
|
||||
/* deflate is ambiguous; on failure retry with the other windowBits */
|
||||
for (attempt = 0; attempt < 2 && ret < 0; attempt++) {
|
||||
const int windowBits =
|
||||
(attempt == 0 ? wrapped : !wrapped) ? (32 + MAX_WBITS) : -MAX_WBITS;
|
||||
FILE *fpout;
|
||||
z_stream strm;
|
||||
if (fpout) {
|
||||
int nr;
|
||||
|
||||
if (attempt > 0) {
|
||||
/* rewind input; reopening fpout "wb" discards the partial output */
|
||||
if (fseek(in, 0, SEEK_SET) != 0)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
}
|
||||
fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
if (fpout == NULL)
|
||||
break;
|
||||
memset(&strm, 0, sizeof(strm));
|
||||
if (inflateInit2(&strm, windowBits) != Z_OK) {
|
||||
fclose(fpout);
|
||||
break;
|
||||
}
|
||||
{
|
||||
hts_boolean ok = HTS_TRUE;
|
||||
int size = 0;
|
||||
int zerr = Z_OK;
|
||||
|
||||
/* chunked inflate; first chunk in inbuf, single member */
|
||||
do {
|
||||
strm.next_in = inbuf;
|
||||
strm.avail_in = (uInt) navail;
|
||||
do {
|
||||
unsigned char BIGSTK outbuf[8192];
|
||||
size_t produced;
|
||||
char BIGSTK buff[1024];
|
||||
|
||||
strm.next_out = outbuf;
|
||||
strm.avail_out = sizeof(outbuf);
|
||||
zerr = inflate(&strm, Z_NO_FLUSH);
|
||||
if (zerr == Z_NEED_DICT || zerr == Z_DATA_ERROR ||
|
||||
zerr == Z_MEM_ERROR || zerr == Z_STREAM_ERROR) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
produced = sizeof(outbuf) - strm.avail_out;
|
||||
if (produced > 0 &&
|
||||
fwrite(outbuf, 1, produced, fpout) != produced) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
size += (int) produced;
|
||||
} while (strm.avail_out == 0);
|
||||
if (!ok || zerr == Z_STREAM_END)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
} while (navail > 0);
|
||||
if (ok && zerr == Z_STREAM_END)
|
||||
ret = size;
|
||||
}
|
||||
inflateEnd(&strm);
|
||||
fclose(fpout);
|
||||
nr = gzread(gz, buff, sizeof(buff));
|
||||
if (nr > 0) {
|
||||
size += nr;
|
||||
if (fwrite(buff, 1, nr, fpout) != nr)
|
||||
nr = size = -1;
|
||||
}
|
||||
} while(nr > 0);
|
||||
fclose(fpout);
|
||||
} else
|
||||
size = -1;
|
||||
gzclose(gz);
|
||||
ret = (int) size;
|
||||
}
|
||||
if (in != NULL) {
|
||||
fclose(in);
|
||||
}
|
||||
fclose(in);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
@@ -497,12 +497,6 @@ static const char *GetHttpMessage(int statuscode) {
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
break;
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
break;
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
break;
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
break;
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_zlib-cache.test writes the cache with the same build it reads back (a
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
# writer and reader drift together. This reads a *committed* cache frozen by an
|
||||
# earlier build and asserts a fixed set of entries still decodes field- and
|
||||
@@ -90,16 +90,4 @@ refused "dangling-quote argument not refused cleanly"
|
||||
run_only "$tmp/q-lone" '"'
|
||||
refused "lone-quote argument not refused cleanly"
|
||||
|
||||
# --pause (#185): valid MIN[:MAX] accepted; malformed, reversed, over-range and
|
||||
# non-finite values refused cleanly. NaN defeats naive `<`/`>` checks (it
|
||||
# compares false to everything), so it must not slip through to the int cast.
|
||||
run "$tmp/pause-ok" --pause 0.2:0.4
|
||||
accepted "$tmp/pause-ok" "#185: valid --pause range rejected"
|
||||
run "$tmp/pause-fix" --pause 0.2
|
||||
accepted "$tmp/pause-fix" "#185: valid fixed --pause rejected"
|
||||
for bad in nan nan:5 5:nan inf 10:5 99999; do
|
||||
run "$tmp/pause-bad" --pause "$bad"
|
||||
refused "#185: invalid --pause '$bad' not refused cleanly"
|
||||
done
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -18,21 +18,6 @@ ent '&' '&'
|
||||
ent '<>' '<>'
|
||||
ent 'é' 'é'
|
||||
|
||||
# HTML5 names from the WHATWG set
|
||||
ent '…' '…'
|
||||
ent '⋃' '⋃'
|
||||
# longest name (31 chars) exercises the name-length cap
|
||||
ent '∳' '∳'
|
||||
# astral codepoint -> 4-byte UTF-8
|
||||
ent '𝔸' '𝔸'
|
||||
# multi-codepoint refs are skipped at generation, so left verbatim
|
||||
ent 'fj' 'fj'
|
||||
|
||||
# common HTML4 names still decode (regression guard against accidental drops)
|
||||
ent '©®™' '©®™'
|
||||
ent '—–' '—–'
|
||||
ent 'αβ' 'αβ'
|
||||
|
||||
# numeric: decimal and hex
|
||||
ent 'AB' 'AB'
|
||||
ent 'A' 'A'
|
||||
|
||||
@@ -50,54 +50,27 @@ match '*foo*bar' 'foozbar'
|
||||
# '?' is the query-string marker, not a single-char wildcard
|
||||
nomatch 'a?c' 'abc'
|
||||
|
||||
# Inside a class, backslash escapes the next char as a literal member (#148):
|
||||
# '\X' matches X only (not '\'), and an escaped ']' is a member, not the terminator.
|
||||
# backslash escapes a metacharacter inside a class so it is matched literally.
|
||||
# Quirk: the decoder also adds the backslash itself to the set, so '\X' matches
|
||||
# both X and '\'. These assertions pin that behavior.
|
||||
match '*[\*]' '*'
|
||||
nomatch '*[\*]' "\\"
|
||||
match '*[\*]' "\\"
|
||||
nomatch '*[\*]' 'a'
|
||||
match '*[\\]' "\\"
|
||||
nomatch '*[\\]' '*'
|
||||
nomatch '*[\\]' 'a'
|
||||
match '*[\[]' '['
|
||||
nomatch '*[\[]' "\\"
|
||||
match '*[\]]' ']'
|
||||
nomatch '*[\]]' "\\"
|
||||
match '*[\[]' "\\"
|
||||
nomatch '*[\[]' 'a'
|
||||
|
||||
# '*[\[\]]' is "the [ or ] character", as the filter guide documents.
|
||||
match '*[\[\]]' '['
|
||||
match '*[\[\]]' ']'
|
||||
nomatch '*[\[\]]' 'a'
|
||||
match '*[\[,\]]' '[' # comma between members is optional
|
||||
match '*[\[,\]]' ']'
|
||||
match '*[a,\[]' 'a' # an escaped member no longer eats the preceding one
|
||||
match '*[a,\[]' '['
|
||||
|
||||
# Escape is decoded before the range/separator/size checks, so '\-' '\,' '\<'
|
||||
# are literal members, not operators.
|
||||
match '*[a\-z]' 'a'
|
||||
match '*[a\-z]' 'z'
|
||||
nomatch '*[a\-z]' 'b' # not the a..z range
|
||||
match '*[\,]' ','
|
||||
nomatch '*[\,]' "\\" # the escape must not leak '\' into the class
|
||||
match '*[\<]' '<'
|
||||
nomatch '*[\<]' "\\"
|
||||
match '*[\[,\],a]' '['
|
||||
match '*[\[,\],a]' ']'
|
||||
match '*[\[,\],a]' 'a'
|
||||
|
||||
# A truncated range '*[a-' is the literal members {a,-}; the parser must not
|
||||
# read past the end decoding it (was a 1-byte heap over-read in the range arm).
|
||||
match '*[a-' 'a'
|
||||
nomatch '*[a-' 'b'
|
||||
|
||||
# *(...) matches exactly one char from the class; *[...] matches a run.
|
||||
match '*(a,b)' 'a'
|
||||
nomatch '*(a,b)' 'aa'
|
||||
nomatch '*(a,b)' 'c'
|
||||
|
||||
# documented composite filters (filters.html)
|
||||
match 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.zip'
|
||||
nomatch 'www.*[path].com/*[path].zip' 'www.foo.com/a/b.tar'
|
||||
match '*.html*[]' 'page.html'
|
||||
nomatch '*.html*[]' 'page.html?x=1' # *[] forbids the trailing query
|
||||
# A literal ']' cannot be a class member: the class parser stops at the first
|
||||
# ']', escaped or not. So '*[\[\]]' does NOT mean "the [ or ] character" as the
|
||||
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
|
||||
# by a trailing literal ']'. These assertions document the current (buggy)
|
||||
# behavior so any future matcher fix is a deliberate, visible change.
|
||||
nomatch '*[\[\]]' '[' # not matched, despite the docs
|
||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||
nomatch '*[\[\]]' '[]x'
|
||||
|
||||
# Size-based rules (-#test=filtersize <size> <string> <filter...>): a negative size
|
||||
# means the size is still unknown (scan time). A size exclusion must stay neutral
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# Runs the in-process makeindex self-test: hts_finish_makeindex must write the
# footer and emit the refresh meta only for a single first link (guards the
# macro->function extraction). The self-test needs a writable scratch
# directory, removed again on exit.
workdir=$(mktemp -d)
trap 'rm -rf "$workdir"' EXIT

httrack -O /dev/null -#test=makeindex "$workdir" run | grep -q "makeindex self-test OK"
|
||||
@@ -323,33 +323,4 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||
|
||||
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
|
||||
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
|
||||
site10="$tmp/html5media"
|
||||
mkdir -p "$site10"
|
||||
for f in img ss plain; do gif "$site10/$f.gif"; done
|
||||
printf 'x' >"$site10/v.webm"
|
||||
printf 'x' >"$site10/subs.vtt"
|
||||
cat >"$site10/index.html" <<EOF
|
||||
<html><body><a href="leaf.html">leaf</a></body></html>
|
||||
EOF
|
||||
cat >"$site10/leaf.html" <<EOF
|
||||
<html><body>
|
||||
<img src="img.gif">
|
||||
<picture><source srcset="ss.gif 2x"></picture>
|
||||
<video><source src="v.webm"></video>
|
||||
<video><track src="subs.vtt"></video>
|
||||
<a href="plain.gif">plain link past the boundary</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out10="$tmp/html5media-out"
|
||||
rm -rf "$out10"
|
||||
mkdir -p "$out10"
|
||||
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
|
||||
found "img.gif" "$out10"
|
||||
found "ss.gif" "$out10"
|
||||
found "v.webm" "$out10"
|
||||
found "subs.vtt" "$out10"
|
||||
notfound "plain.gif" "$out10"
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
#
# --pause (#185): the inter-file pause target must stay in [min,max] and spread
# across it (a per-call rand() would collapse it toward min). Driven by the
# in-process 'httrack -#test=pause' test. POSIX-portable ($(BASH) is /bin/sh on macOS).

set -eu

# The trailing 'run' is a placeholder argument that the self-test ignores.
result=$(httrack -#test=pause run)

# The self-test prints exactly one verdict line; anything else is a failure.
if [ "$result" != "pause: OK" ]; then
	echo "expected 'pause: OK', got: $result" >&2
	exit 1
fi
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# robots.txt RFC 9309 Allow/Disallow precedence (#452): longest match wins.
# Passes only when the in-process self-test prints its OK line.
httrack -O /dev/null -#test=robots run |
	grep -q "robots self-test OK"
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# HTTP status -> reason phrase, including the modern 429/451 (#453).
# Passes only when the in-process self-test prints its OK line.
httrack -O /dev/null -#test=status run |
	grep -q "status self-test OK"
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# Default User-Agent (#449): honest HTTrack token, no Windows 98 relic.
# Passes only when the in-process self-test prints its OK line.
httrack -O /dev/null -#test=useragent run |
	grep -q "useragent self-test OK"
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# Accept-Encoding (#450): advertise gzip+deflate; decode gzip/zlib/raw-deflate.
# The self-test writes its compressed fixtures into a scratch directory,
# removed again on exit.
workdir=$(mktemp -d)
trap 'rm -rf "$workdir"' EXIT

httrack -O /dev/null -#test=acceptencoding "$workdir" run | grep -q "acceptencoding self-test OK"
|
||||
@@ -20,14 +20,6 @@ if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "python3 missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
# The fixture needs a second loopback IP (dead 127.0.0.2 + live 127.0.0.1) for
|
||||
# the fallback to have a target; GNU/Hurd has only 127.0.0.1, so skip there.
|
||||
case "$(uname -s)" in
|
||||
GNU | GNU/*)
|
||||
echo "GNU/Hurd: single loopback IP, connect-fallback fixture unbuildable, skipping"
|
||||
exit 77
|
||||
;;
|
||||
esac
|
||||
|
||||
server="$top_srcdir/tests/local-server.py"
|
||||
root="$top_srcdir/tests/server-root"
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
#!/bin/bash
#
# End-to-end --cookies-file (#215): /gated/secret.php needs a cookie no page
# ever Set-Cookies, so it is reachable only when the option preloads it from a
# Netscape cookies.txt. Locks the CLI->opt->cookie_load->wire plumbing.

set -e

: "${top_srcdir:=..}"

crawl="$top_srcdir/tests/local-crawl.sh"

# With the preloaded cookie the secret page is served. -o0 means a 500 leaves
# no file, so --found/--files only hold when the secret is genuinely fetched
# (200).
bash "$crawl" --cookie 'session=opensesame' \
	--errors 0 --files 2 \
	--found 'gated/index.html' --found 'gated/secret.html' \
	httrack 'BASEURL/gated/index.php' -o0

# Control run without the cookie: the secret 500s, and -o0 suppresses the
# error page so its absence is real (one error plus a missing file).
bash "$crawl" --errors 1 \
	--found 'gated/index.html' --not-found 'gated/secret.html' \
	httrack 'BASEURL/gated/index.php' -o0
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
#
# --pause (#185): a fixed inter-file delay must slow a multi-file crawl. Measure
# the same crawl with and without --pause and compare: the harness overhead
# cancels, leaving only the pause. Integer seconds keep it portable (BSD date
# has no %N); a lower bound is not timing-flaky since a pause only adds time.

set -e

: "${top_srcdir:=..}"

# python3 runs the local server (mirror local-crawl.sh); skip when absent, else
# the timing helper swallows its exit-77 and the serverless 0s/0s crawl looks
# like a fail.
if ! command -v python3 >/dev/null; then
	echo "python3 not found; skipping local crawl tests"
	exit 77
fi

# Prints the wall-clock seconds one crawl took; extra arguments are appended
# to the httrack command line.
timed_crawl() {
	local started finished
	started=$(date +%s)
	bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
		httrack 'BASEURL/types/index.html' -c1 "$@" >/dev/null 2>&1
	finished=$(date +%s)
	echo $((finished - started))
}

base=$(timed_crawl)
paused=$(timed_crawl --pause 0.5)
delta=$((paused - base))

echo "crawl: ${base}s, with --pause 0.5: ${paused}s (delta ${delta}s)"
# The paused crawl must take at least two seconds longer than the baseline.
[ "$delta" -ge 2 ] || {
	echo "FAIL: --pause did not delay the crawl (delta ${delta}s)" >&2
	exit 1
}
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Issue #204: a 302 Location with a #fragment must drop the fragment before the
|
||||
# target is fetched. The server is strict (400 on a '#' in the request-target),
|
||||
# so a leaked fragment logs an error and the target is never saved.
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'redir/target.html' \
|
||||
httrack 'BASEURL/redir/index.html'
|
||||
@@ -1,4 +1,4 @@
|
||||
# Committed binary fixture read by 01_zlib-cache-golden.test. List it
|
||||
# Committed binary fixture read by 01_engine-cache-golden.test. List it
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
@@ -25,6 +25,9 @@ TEST_EXTENSIONS = .test
|
||||
TEST_LOG_COMPILER = $(BASH)
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -36,25 +39,16 @@ TESTS = \
|
||||
01_engine-filter.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-makeindex.test \
|
||||
01_engine-mime.test \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
@@ -78,9 +72,6 @@ TESTS = \
|
||||
23_local-errpage.test \
|
||||
24_local-resume-overlap.test \
|
||||
25_local-mime-exclude.test \
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test
|
||||
26_local-strip-query.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -12,14 +12,11 @@
|
||||
# the mirror directory name.
|
||||
#
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -88,7 +85,6 @@ tmpdir=$(mktemp -d "${tmptopdir}/httrack_local.XXXXXX") || die "could not create
|
||||
|
||||
# --- parse leading control flags --------------------------------------------
|
||||
declare -a audit=()
|
||||
declare -a cookies=()
|
||||
scheme=http
|
||||
pos=0
|
||||
args=("$@")
|
||||
@@ -109,10 +105,6 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
root="${args[$pos]}"
|
||||
;;
|
||||
--cookie)
|
||||
pos=$((pos + 1))
|
||||
cookies+=("${args[$pos]}")
|
||||
;;
|
||||
--errors | --files)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
@@ -166,17 +158,6 @@ while test "$pos" -lt "$nargs"; do
|
||||
pos=$((pos + 1))
|
||||
done
|
||||
|
||||
# --- materialize any --cookie entries into a cookies.txt ---------------------
|
||||
if test "${#cookies[@]}" -gt 0; then
|
||||
jar="${tmpdir}/cookies.txt"
|
||||
: >"$jar"
|
||||
for spec in "${cookies[@]}"; do
|
||||
printf '127.0.0.1:%s\tTRUE\t/\tFALSE\t1999999999\t%s\t%s\n' \
|
||||
"$port" "${spec%%=*}" "${spec#*=}" >>"$jar"
|
||||
done
|
||||
hts+=(--cookies-file "$jar")
|
||||
fi
|
||||
|
||||
# --- run httrack -------------------------------------------------------------
|
||||
which httrack >/dev/null || die "could not find httrack"
|
||||
ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
|
||||
|
||||
@@ -110,19 +110,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
return self.fail_cookie("badger")
|
||||
self.send_html("\tThis is a test.")
|
||||
|
||||
# --cookies-file (#215): the secret page needs a cookie no page ever sets,
|
||||
# so it is reachable only when --cookies-file preloads it.
|
||||
GATE_COOKIE = ("session", "opensesame")
|
||||
|
||||
def route_gated_index(self):
|
||||
self.send_html('\tThis is a <a href="secret.php">link</a>')
|
||||
|
||||
def route_gated_secret(self):
|
||||
name, value = self.GATE_COOKIE
|
||||
if self.request_cookies().get(name) != value:
|
||||
return self.fail_cookie(name)
|
||||
self.send_html("\tThis is the secret.")
|
||||
|
||||
def route_robots(self):
|
||||
body = b"User-agent: *\nDisallow:\n"
|
||||
self.send_response(200)
|
||||
@@ -354,27 +341,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
|
||||
# that must be dropped before the target is fetched. A leaked '#' reaches the
|
||||
# strict-server guard below and 400s.
|
||||
def route_redir_index(self):
|
||||
self.send_html('\t<a href="go.php">go</a>')
|
||||
|
||||
def route_redir_go(self):
|
||||
self.send_response(302, "Found")
|
||||
self.send_header("Location", "target.html#section")
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
def route_redir_target(self):
|
||||
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
"/cookies/third.php": route_third,
|
||||
"/gated/index.php": route_gated_index,
|
||||
"/gated/secret.php": route_gated_secret,
|
||||
"/robots.txt": route_robots,
|
||||
"/types/index.html": route_types_index,
|
||||
"/types/control.php": route_types,
|
||||
@@ -406,23 +376,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/mimex/index.html": route_mimex_index,
|
||||
"/mimex/blob.pdf": route_mimex_blob,
|
||||
"/mimex/real.html": route_mimex_real,
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
def reject_fragment(self):
|
||||
# Strict server: a '#' in the request-target is the client failing to
|
||||
# drop a fragment (#204). RFC 3986 forbids it on the wire; answer 400.
|
||||
if "#" in self.path:
|
||||
self.send_response(400, "Bad Request")
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
return True
|
||||
return False
|
||||
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
@@ -434,14 +391,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
return False
|
||||
|
||||
def do_GET(self):
|
||||
if self.reject_fragment():
|
||||
return
|
||||
if not self.dispatch():
|
||||
super().do_GET()
|
||||
|
||||
def do_HEAD(self):
|
||||
if self.reject_fragment():
|
||||
return
|
||||
if not self.dispatch():
|
||||
super().do_HEAD()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user