mirror of
https://github.com/xroche/httrack.git
synced 2026-06-22 18:17:46 +03:00
Compare commits
35 Commits
feature/ap
...
chore/lint
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fe7041ddbf | ||
|
|
f5543df1af | ||
|
|
fee30aa95d | ||
|
|
f9f4700ee1 | ||
|
|
f030fa21e3 | ||
|
|
bdd1c1bc2c | ||
|
|
56665a268f | ||
|
|
2e948b9acd | ||
|
|
cae11499f1 | ||
|
|
02c7f4ebf6 | ||
|
|
9070b44a70 | ||
|
|
799c045061 | ||
|
|
fb1ee3bf2e | ||
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 | ||
|
|
0145ec37a3 | ||
|
|
a80fab38ba | ||
|
|
c52a524a63 | ||
|
|
1907621d37 | ||
|
|
3b2d7afdaa | ||
|
|
6ee539619e | ||
|
|
fb098b27b4 | ||
|
|
5f6a3fb917 | ||
|
|
f9e676dbe3 | ||
|
|
1b440c44b5 | ||
|
|
ac6dd1a570 | ||
|
|
4549ec3695 | ||
|
|
ac56c31b24 | ||
|
|
ee6beeeb7d | ||
|
|
6788bda380 | ||
|
|
7ead8d595e | ||
|
|
93f502990c | ||
|
|
0f4b2596b2 | ||
|
|
4a676bb5e1 |
20
.github/workflows/ci.yml
vendored
20
.github/workflows/ci.yml
vendored
@@ -320,6 +320,21 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
name: lint (shellcheck, shfmt)
|
name: lint (shellcheck, shfmt)
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
# Every tracked shell script; the globs expand at run time. Kept here so the
|
||||||
|
# shellcheck and shfmt steps below cannot drift apart.
|
||||||
|
env:
|
||||||
|
SHELL_SCRIPTS: >-
|
||||||
|
.githooks/pre-commit
|
||||||
|
bootstrap
|
||||||
|
build.sh
|
||||||
|
html/div/search.sh
|
||||||
|
man/makeman.sh
|
||||||
|
src/htsbasiccharsets.sh
|
||||||
|
src/htsentities.sh
|
||||||
|
src/webhttrack
|
||||||
|
tests/*.sh
|
||||||
|
tests/*.test
|
||||||
|
tools/mkdeb.sh
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
@@ -332,12 +347,11 @@ jobs:
|
|||||||
sudo apt-get install -y --no-install-recommends shellcheck shfmt
|
sudo apt-get install -y --no-install-recommends shellcheck shfmt
|
||||||
shfmt --version
|
shfmt --version
|
||||||
|
|
||||||
# Lint the scripts we maintain; the legacy scripts are a separate cleanup.
|
|
||||||
- name: shellcheck
|
- name: shellcheck
|
||||||
run: shellcheck man/makeman.sh tools/mkdeb.sh .githooks/pre-commit tests/*.test tests/check-network.sh
|
run: shellcheck $SHELL_SCRIPTS
|
||||||
|
|
||||||
- name: shfmt
|
- name: shfmt
|
||||||
run: shfmt -d -i 4 man/makeman.sh tools/mkdeb.sh .githooks/pre-commit
|
run: shfmt -d -i 4 $SHELL_SCRIPTS
|
||||||
|
|
||||||
# Check clang-format on CHANGED LINES ONLY. The engine predates clang-format
|
# Check clang-format on CHANGED LINES ONLY. The engine predates clang-format
|
||||||
# (it was shaped by an old Visual Studio formatter) and does not round-trip,
|
# (it was shaped by an old Visual Studio formatter) and does not round-trip,
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
|
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
# Simple indexing test using HTTrack
|
# Simple indexing test using HTTrack
|
||||||
# A "real" script/program would use advanced search, and
|
# A "real" script/program would use advanced search, and
|
||||||
# use dichotomy to find the word in the index.txt file
|
# use dichotomy to find the word in the index.txt file
|
||||||
# This script is really basic and NOT optimized, and
|
# This script is really basic and NOT optimized, and
|
||||||
# should not be used for professional purpose :)
|
# should not be used for professional purpose :)
|
||||||
@@ -11,50 +10,49 @@ TESTSITE="http://localhost/"
|
|||||||
|
|
||||||
# Create an index if necessary
|
# Create an index if necessary
|
||||||
if ! test -f "index.txt"; then
|
if ! test -f "index.txt"; then
|
||||||
echo "Building the index .."
|
echo "Building the index .."
|
||||||
rm -rf test
|
rm -rf test
|
||||||
httrack --display "$TESTSITE" -%I -O test
|
httrack --display "$TESTSITE" -%I -O test
|
||||||
mv test/index.txt ./
|
mv test/index.txt ./
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Convert crlf to lf
|
# Convert crlf to lf
|
||||||
if test "`head index.txt -n 1 | tr '\r' '#' | grep -c '#'`" = "1"; then
|
if test "$(head index.txt -n 1 | tr '\r' '#' | grep -c '#')" = "1"; then
|
||||||
echo "Converting index to Unix LF style (not CR/LF) .."
|
echo "Converting index to Unix LF style (not CR/LF) .."
|
||||||
mv -f index.txt index.txt.old
|
mv -f index.txt index.txt.old
|
||||||
cat index.txt.old|tr -d '\r' > index.txt
|
tr -d '\r' <index.txt.old >index.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
keyword=-
|
keyword=-
|
||||||
while test -n "$keyword"; do
|
while test -n "$keyword"; do
|
||||||
printf "Enter a keyword: "
|
printf "Enter a keyword: "
|
||||||
read keyword
|
read -r keyword
|
||||||
|
|
||||||
if test -n "$keyword"; then
|
if test -n "$keyword"; then
|
||||||
FOUNDK="`grep -niE \"^$keyword\" index.txt`"
|
FOUNDK="$(grep -niE "^$keyword" index.txt)"
|
||||||
|
|
||||||
if test -n "$FOUNDK"; then
|
if test -n "$FOUNDK"; then
|
||||||
if ! test `echo "$FOUNDK"|wc -l` = "1"; then
|
if ! test "$(echo "$FOUNDK" | wc -l)" = "1"; then
|
||||||
# Multiple matches
|
# Multiple matches
|
||||||
printf "Found multiple keywords: "
|
printf "Found multiple keywords: "
|
||||||
echo "$FOUNDK"|cut -f2 -d':'|tr '\n' ' '
|
echo "$FOUNDK" | cut -f2 -d':' | tr '\n' ' '
|
||||||
echo ""
|
echo ""
|
||||||
echo "Use keyword$ to find only one"
|
echo "Use keyword$ to find only one"
|
||||||
else
|
else
|
||||||
# One match
|
# One match
|
||||||
N=`echo "$FOUNDK"|cut -f1 -d':'`
|
N=$(echo "$FOUNDK" | cut -f1 -d':')
|
||||||
PM=`tail +$N index.txt|grep -nE "\("|head -n 1`
|
PM=$(tail "+$N" index.txt | grep -nE "\(" | head -n 1)
|
||||||
if ! echo "$PM"|grep "ignored">/dev/null; then
|
if ! echo "$PM" | grep "ignored" >/dev/null; then
|
||||||
M=`echo $PM|cut -f1 -d':'`
|
M=$(echo "$PM" | cut -f1 -d':')
|
||||||
echo "Found in:"
|
echo "Found in:"
|
||||||
cat index.txt | tail "+$N" | head -n "$M" | grep -E "[0-9]* " | cut -f2 -d' '
|
tail "+$N" index.txt | head -n "$M" | grep -E "[0-9]* " | cut -f2 -d' '
|
||||||
else
|
else
|
||||||
echo "keyword ignored (too many hits)"
|
echo "keyword ignored (too many hits)"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "not found"
|
echo "not found"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
@@ -2532,8 +2532,26 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
#if HTS_USEOPENSSL
|
#if HTS_USEOPENSSL
|
||||||
/* SSL mode */
|
/* SSL mode */
|
||||||
if (back[i].r.ssl) {
|
if (back[i].r.ssl) {
|
||||||
|
int tunnel_ok = 1;
|
||||||
|
|
||||||
|
// https via proxy: CONNECT-tunnel before TLS (#85)
|
||||||
|
if (back[i].r.req.proxy.active && back[i].r.ssl_con == NULL) {
|
||||||
|
const int timeout = back[i].timeout > 0 ? back[i].timeout : 30;
|
||||||
|
|
||||||
|
tunnel_ok =
|
||||||
|
http_proxy_tunnel(opt, &back[i].r, back[i].url_adr, timeout);
|
||||||
|
if (!tunnel_ok) {
|
||||||
|
if (!strnotempty(back[i].r.msg))
|
||||||
|
strcpybuff(back[i].r.msg, "proxy CONNECT failed");
|
||||||
|
deletehttp(&back[i].r);
|
||||||
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
|
back[i].r.statuscode = STATUSCODE_NON_FATAL;
|
||||||
|
back[i].status = STATUS_READY;
|
||||||
|
back_set_finished(sback, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
// handshake not yet launched
|
// handshake not yet launched
|
||||||
if (!back[i].r.ssl_con) {
|
if (tunnel_ok && !back[i].r.ssl_con) {
|
||||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
||||||
// new session
|
// new session
|
||||||
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
||||||
@@ -2551,7 +2569,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
||||||
}
|
}
|
||||||
/* Error */
|
/* Error */
|
||||||
if (back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
if (tunnel_ok && back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||||
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
||||||
deletehttp(&back[i].r);
|
deletehttp(&back[i].r);
|
||||||
back[i].r.soc = INVALID_SOCKET;
|
back[i].r.soc = INVALID_SOCKET;
|
||||||
@@ -3838,7 +3856,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
/* funny log for commandline users */
|
/* funny log for commandline users */
|
||||||
//if (!opt->quiet) {
|
//if (!opt->quiet) {
|
||||||
// petite animation
|
// petite animation
|
||||||
if (opt->verbosedisplay == 1) {
|
if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||||
if (back[i].status == STATUS_READY) {
|
if (back[i].status == STATUS_READY) {
|
||||||
if (back[i].r.statuscode == HTTP_OK)
|
if (back[i].r.statuscode == HTTP_OK)
|
||||||
printf("* %s%s (" LLintP " bytes) - OK" VT_CLREOL "\r",
|
printf("* %s%s (" LLintP " bytes) - OK" VT_CLREOL "\r",
|
||||||
|
|||||||
@@ -3,57 +3,59 @@
|
|||||||
|
|
||||||
# Change this to download files
|
# Change this to download files
|
||||||
if false; then
|
if false; then
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
|
||||||
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
|
||||||
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Produce code
|
# Produce code
|
||||||
printf "/** GENERATED FILE ($0), DO NOT EDIT **/\n\n"
|
printf '/** GENERATED FILE (%s), DO NOT EDIT **/\n\n' "$0"
|
||||||
for i in *.TXT ; do
|
for i in *.TXT; do
|
||||||
echo "processing $i" >&2
|
echo "processing $i" >&2
|
||||||
grep -vE "^(#|$)" $i | grep -E "^0x" | sed -e 's/[[:space:]]/ /g' | cut -f1,2 -d' ' | \
|
grep -vE "^(#|$)" "$i" | grep -E "^0x" | sed -e 's/[[:space:]]/ /g' | cut -f1,2 -d' ' |
|
||||||
(
|
(
|
||||||
unset arr
|
unset arr
|
||||||
while read LINE ; do
|
while read -r LINE; do
|
||||||
from=$[$(echo $LINE | cut -f1 -d' ')]
|
from=$(($(echo "$LINE" | cut -f1 -d' ')))
|
||||||
if ! test -n "$from"; then
|
if ! test -n "$from"; then
|
||||||
echo "error with $i" >&2
|
echo "error with $i" >&2
|
||||||
exit 1
|
exit 1
|
||||||
elif test $from -ge 256; then
|
elif test $from -ge 256; then
|
||||||
echo "out-of-range ($LINE) with $i" >&2
|
echo "out-of-range ($LINE) with $i" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
to=$(echo $LINE | cut -f2 -d' ')
|
to=$(echo "$LINE" | cut -f2 -d' ')
|
||||||
arr[$from]=$to
|
arr[from]=$to
|
||||||
done
|
done
|
||||||
name=$(echo $i | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
# shellcheck disable=SC2018,SC2019 # charset filenames are ASCII; keep C-locale A-Z/a-z
|
||||||
printf "/* Table for $i */\nstatic const hts_UCS4 table_${name}[256] = {\n "
|
name=$(echo "$i" | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
||||||
i=0
|
printf '/* Table for %s */\nstatic const hts_UCS4 table_%s[256] = {\n ' "$i" "$name"
|
||||||
while test "$i" -lt 256; do
|
idx=0
|
||||||
if test "$i" -gt 0; then
|
while test "$idx" -lt 256; do
|
||||||
printf ", "
|
if test "$idx" -gt 0; then
|
||||||
if test $[${i}%8] -eq 0; then
|
printf ", "
|
||||||
printf "\n "
|
if test $((idx % 8)) -eq 0; then
|
||||||
fi
|
printf "\n "
|
||||||
fi
|
fi
|
||||||
value=${arr[$i]:-0}
|
fi
|
||||||
printf "0x%04x" $value
|
value=${arr[$idx]:-0}
|
||||||
i=$[${i}+1]
|
printf "0x%04x" "$value"
|
||||||
done
|
idx=$((idx + 1))
|
||||||
printf " };\n\n"
|
done
|
||||||
)
|
printf " };\n\n"
|
||||||
echo "processed $i" >&2
|
)
|
||||||
|
echo "processed $i" >&2
|
||||||
done
|
done
|
||||||
|
|
||||||
# Indexes
|
# Indexes
|
||||||
printf "static const struct {\n const char *name;\n const hts_UCS4 *table;\n} table_mappings[] = {\n"
|
printf "static const struct {\n const char *name;\n const hts_UCS4 *table;\n} table_mappings[] = {\n"
|
||||||
for i in *.TXT ; do
|
for i in *.TXT; do
|
||||||
name=$(echo $i | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
# shellcheck disable=SC2018,SC2019 # charset filenames are ASCII; keep C-locale A-Z/a-z
|
||||||
printf " { \"$(echo $name | tr -d '_')\", table_${name} },\n"
|
name=$(echo "$i" | tr 'A-Z' 'a-z' | tr '-' '_' | sed -e 's/\.txt//' -e 's/8859/iso_8859/')
|
||||||
|
printf ' { "%s", table_%s },\n' "$(echo "$name" | tr -d '_')" "$name"
|
||||||
done
|
done
|
||||||
printf " { NULL, NULL }\n};\n"
|
printf " { NULL, NULL }\n};\n"
|
||||||
|
|||||||
@@ -135,7 +135,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, /* 128 bytes */ char *adr) {
|
|||||||
// returns 0 if error
|
// returns 0 if error
|
||||||
// url: buffer where URL must be stored - or ip:port in case of failure
|
// url: buffer where URL must be stored - or ip:port in case of failure
|
||||||
// data: 32Kb
|
// data: 32Kb
|
||||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data) {
|
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||||
|
char *data) {
|
||||||
int retour = 0;
|
int retour = 0;
|
||||||
|
|
||||||
// connexion (accept)
|
// connexion (accept)
|
||||||
|
|||||||
@@ -2585,7 +2585,7 @@ static int mkdir_compat(const char *pathname) {
|
|||||||
|
|
||||||
/* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */
|
/* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */
|
||||||
/* Note: preserve errno */
|
/* Note: preserve errno */
|
||||||
HTSEXT_API int dir_exists(const char *path) {
|
HTSEXT_API hts_boolean dir_exists(const char *path) {
|
||||||
const int err = errno;
|
const int err = errno;
|
||||||
STRUCT_STAT st;
|
STRUCT_STAT st;
|
||||||
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
||||||
@@ -3342,7 +3342,8 @@ int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
|||||||
int ptr, int numero_passe) {
|
int ptr, int numero_passe) {
|
||||||
int n = back_pluggable_sockets(sback, opt);
|
int n = back_pluggable_sockets(sback, opt);
|
||||||
|
|
||||||
if (opt->savename_delayed == 2 && !opt->delayed_cached) /* cancel (always delayed) */
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||||
|
!opt->delayed_cached) /* cancel (always delayed) */
|
||||||
return 0;
|
return 0;
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
int p;
|
int p;
|
||||||
@@ -3646,7 +3647,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int p) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ask for termination
|
// ask for termination
|
||||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force) {
|
||||||
if (opt != NULL) {
|
if (opt != NULL) {
|
||||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
@@ -3656,7 +3657,7 @@ HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt) {
|
||||||
int ended;
|
int ended;
|
||||||
hts_mutexlock(&opt->state.lock);
|
hts_mutexlock(&opt->state.lock);
|
||||||
ended = opt->state.is_ended;
|
ended = opt->state.is_ended;
|
||||||
@@ -3678,12 +3679,12 @@ HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
|||||||
//}
|
//}
|
||||||
// ajout d'URL
|
// ajout d'URL
|
||||||
// -1 : erreur
|
// -1 : erreur
|
||||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url) {
|
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url) {
|
||||||
if (url)
|
if (url)
|
||||||
opt->state._hts_addurl = url;
|
opt->state._hts_addurl = url;
|
||||||
return (opt->state._hts_addurl != NULL);
|
return (opt->state._hts_addurl != NULL);
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_resetaddurl(httrackp * opt) {
|
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt) {
|
||||||
opt->state._hts_addurl = NULL;
|
opt->state._hts_addurl = NULL;
|
||||||
return (opt->state._hts_addurl != NULL);
|
return (opt->state._hts_addurl != NULL);
|
||||||
}
|
}
|
||||||
@@ -3702,7 +3703,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
|||||||
if (from->maxsoc > 0)
|
if (from->maxsoc > 0)
|
||||||
to->maxsoc = from->maxsoc;
|
to->maxsoc = from->maxsoc;
|
||||||
|
|
||||||
if (from->nearlink > -1)
|
/* hts_boolean/enum fields are unsigned (GCC), so a bare `> -1` unset-guard
|
||||||
|
is always false; cast to int to keep the -1 "unset" sentinel test. */
|
||||||
|
if ((int) from->nearlink > -1)
|
||||||
to->nearlink = from->nearlink;
|
to->nearlink = from->nearlink;
|
||||||
|
|
||||||
if (from->timeout > -1)
|
if (from->timeout > -1)
|
||||||
@@ -3729,10 +3732,10 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
|||||||
if (from->hostcontrol > -1)
|
if (from->hostcontrol > -1)
|
||||||
to->hostcontrol = from->hostcontrol;
|
to->hostcontrol = from->hostcontrol;
|
||||||
|
|
||||||
if (from->errpage > -1)
|
if ((int) from->errpage > -1)
|
||||||
to->errpage = from->errpage;
|
to->errpage = from->errpage;
|
||||||
|
|
||||||
if (from->parseall > -1)
|
if ((int) from->parseall > -1)
|
||||||
to->parseall = from->parseall;
|
to->parseall = from->parseall;
|
||||||
|
|
||||||
// test all: bit 8 de travel
|
// test all: bit 8 de travel
|
||||||
@@ -3844,7 +3847,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
|
|||||||
a = opt->savename_type;
|
a = opt->savename_type;
|
||||||
b = opt->savename_83;
|
b = opt->savename_83;
|
||||||
opt->savename_type = 0;
|
opt->savename_type = 0;
|
||||||
opt->savename_83 = 0;
|
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||||
// note: adr,fil peuvent être patchés
|
// note: adr,fil peuvent être patchés
|
||||||
r =
|
r =
|
||||||
url_savename(&afs, NULL, NULL, NULL, opt, sback, cache, hashptr, ptr, numero_passe,
|
url_savename(&afs, NULL, NULL, NULL, opt, sback, cache, hashptr, ptr, numero_passe,
|
||||||
|
|||||||
@@ -612,12 +612,12 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
/* Terminal is a tty, may ask questions and display funny information */
|
/* Terminal is a tty, may ask questions and display funny information */
|
||||||
if (isatty(1)) {
|
if (isatty(1)) {
|
||||||
opt->quiet = 0;
|
opt->quiet = 0;
|
||||||
opt->verbosedisplay = 1;
|
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||||
}
|
}
|
||||||
/* Not a tty, no stdin input or funny output! */
|
/* Not a tty, no stdin input or funny output! */
|
||||||
else {
|
else {
|
||||||
opt->quiet = 1;
|
opt->quiet = 1;
|
||||||
opt->verbosedisplay = 0;
|
opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -953,9 +953,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
p = buff;
|
p = buff;
|
||||||
do {
|
do {
|
||||||
int insert_after_argc;
|
int insert_after_argc;
|
||||||
|
int quoted; /* "" unquotes to empty but is still a real token (#106) */
|
||||||
|
|
||||||
// read next
|
// read next
|
||||||
lastp = p;
|
lastp = p;
|
||||||
|
quoted = (p != NULL && *p == '"');
|
||||||
if (p) {
|
if (p) {
|
||||||
p = next_token(p, 1);
|
p = next_token(p, 1);
|
||||||
if (p) {
|
if (p) {
|
||||||
@@ -966,7 +968,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
|
|
||||||
/* Insert parameters BUT so that they can be in the same order */
|
/* Insert parameters BUT so that they can be in the same order */
|
||||||
if (lastp) {
|
if (lastp) {
|
||||||
if (strnotempty(lastp)) {
|
if (strnotempty(lastp) || quoted) {
|
||||||
insert_after_argc = argc - insert_after;
|
insert_after_argc = argc - insert_after;
|
||||||
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
||||||
x_argvblk_size, x_ptr);
|
x_argvblk_size, x_ptr);
|
||||||
@@ -1815,24 +1817,22 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'L':
|
case 'L': {
|
||||||
{
|
sscanf(com + 1, "%d", (int *) &opt->savename_83);
|
||||||
sscanf(com + 1, "%d", &opt->savename_83);
|
switch (opt->savename_83) {
|
||||||
switch (opt->savename_83) {
|
case 0: // 8-3 (ISO9660 L1)
|
||||||
case 0: // 8-3 (ISO9660 L1)
|
opt->savename_83 = HTS_SAVENAME_83_DOS;
|
||||||
opt->savename_83 = 1;
|
break;
|
||||||
break;
|
case 1:
|
||||||
case 1:
|
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||||
opt->savename_83 = 0;
|
break;
|
||||||
break;
|
default: // 2 == ISO9660 (ISO9660 L2)
|
||||||
default: // 2 == ISO9660 (ISO9660 L2)
|
opt->savename_83 = HTS_SAVENAME_83_ISO9660;
|
||||||
opt->savename_83 = 2;
|
break;
|
||||||
break;
|
|
||||||
}
|
|
||||||
while(isdigit((unsigned char) *(com + 1)))
|
|
||||||
com++;
|
|
||||||
}
|
}
|
||||||
break;
|
while (isdigit((unsigned char) *(com + 1)))
|
||||||
|
com++;
|
||||||
|
} break;
|
||||||
case 's':
|
case 's':
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", (int *) &opt->robots);
|
sscanf(com + 1, "%d", (int *) &opt->robots);
|
||||||
@@ -1989,9 +1989,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
}
|
}
|
||||||
break; // url hack
|
break; // url hack
|
||||||
case 'v':
|
case 'v':
|
||||||
opt->verbosedisplay = 2;
|
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", &opt->verbosedisplay);
|
sscanf(com + 1, "%d", (int *) &opt->verbosedisplay);
|
||||||
while(isdigit((unsigned char) *(com + 1)))
|
while(isdigit((unsigned char) *(com + 1)))
|
||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
@@ -2004,9 +2004,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'N':
|
case 'N':
|
||||||
opt->savename_delayed = 2;
|
opt->savename_delayed = HTS_SAVENAME_DELAYED_HARD;
|
||||||
if (isdigit((unsigned char) *(com + 1))) {
|
if (isdigit((unsigned char) *(com + 1))) {
|
||||||
sscanf(com + 1, "%d", &opt->savename_delayed);
|
sscanf(com + 1, "%d", (int *) &opt->savename_delayed);
|
||||||
while(isdigit((unsigned char) *(com + 1)))
|
while(isdigit((unsigned char) *(com + 1)))
|
||||||
com++;
|
com++;
|
||||||
}
|
}
|
||||||
@@ -2787,6 +2787,47 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'l': /* lienrelatif: relative link from curr_fil to link */
|
||||||
|
if (na + 2 >= argc) {
|
||||||
|
HTS_PANIC_PRINTF(
|
||||||
|
"Option #l needs a link and a current-file path");
|
||||||
|
printf(
|
||||||
|
"Example: '-#l' 'host/dir/img.gif' 'host/dir/p.html'\n");
|
||||||
|
htsmain_free();
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
char s[HTS_URLMAXSIZE * 2];
|
||||||
|
|
||||||
|
if (lienrelatif(s, sizeof(s), argv[na + 1], argv[na + 2]) ==
|
||||||
|
0)
|
||||||
|
printf("relative=%s\n", s);
|
||||||
|
else
|
||||||
|
printf("relative=<ERROR>\n");
|
||||||
|
htsmain_free();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'i': /* ident_url_relatif: resolve a link -> adr/fil */
|
||||||
|
if (na + 3 >= argc) {
|
||||||
|
HTS_PANIC_PRINTF(
|
||||||
|
"Option #i needs a link, an origin address and file");
|
||||||
|
printf("Example: '-#i' '../img.gif' 'www.foo.com' "
|
||||||
|
"'/d/p.html'\n");
|
||||||
|
htsmain_free();
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
lien_adrfil af;
|
||||||
|
const int r = ident_url_relatif(argv[na + 1], argv[na + 2],
|
||||||
|
argv[na + 3], &af);
|
||||||
|
|
||||||
|
if (r == 0)
|
||||||
|
printf("adr=%s fil=%s\n", af.adr, af.fil);
|
||||||
|
else
|
||||||
|
printf("error=%d\n", r);
|
||||||
|
htsmain_free();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case '2': // mimedefs
|
case '2': // mimedefs
|
||||||
if (na + 1 >= argc) {
|
if (na + 1 >= argc) {
|
||||||
HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
|
HTS_PANIC_PRINTF("Option #2 needs to be followed by an URL");
|
||||||
@@ -3096,6 +3137,78 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
|||||||
htsmain_free();
|
htsmain_free();
|
||||||
return 0;
|
return 0;
|
||||||
break;
|
break;
|
||||||
|
case '9': { // copy_htsopt selftest: httrack -#9
|
||||||
|
httrackp *from = hts_create_opt();
|
||||||
|
httrackp *to = hts_create_opt();
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
/* from-values differ from both the to-values and the
|
||||||
|
hts_create_opt() defaults (nearlink FALSE, errpage/parseall
|
||||||
|
TRUE), so a copy that no-ops or just resets to defaults is
|
||||||
|
caught too, not only the unsigned-guard bug. */
|
||||||
|
from->retry = 7; /* int field: positive control */
|
||||||
|
to->retry = 0;
|
||||||
|
from->nearlink = HTS_TRUE;
|
||||||
|
to->nearlink = HTS_FALSE;
|
||||||
|
from->errpage = HTS_FALSE;
|
||||||
|
to->errpage = HTS_TRUE;
|
||||||
|
from->parseall = HTS_FALSE;
|
||||||
|
to->parseall = HTS_TRUE;
|
||||||
|
|
||||||
|
copy_htsopt(from, to);
|
||||||
|
|
||||||
|
if (to->retry != 7)
|
||||||
|
err = 1;
|
||||||
|
if (to->nearlink != HTS_TRUE)
|
||||||
|
err = 1;
|
||||||
|
if (to->errpage != HTS_FALSE)
|
||||||
|
err = 1;
|
||||||
|
if (to->parseall != HTS_FALSE)
|
||||||
|
err = 1;
|
||||||
|
|
||||||
|
hts_free_opt(from);
|
||||||
|
hts_free_opt(to);
|
||||||
|
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||||
|
htsmain_free();
|
||||||
|
return err;
|
||||||
|
} break;
|
||||||
|
case 'Q': { // cookie request-header selftest: httrack -#Q
|
||||||
|
static t_cookie cookie;
|
||||||
|
char hdr[1024];
|
||||||
|
/* RFC 6265: bare name=value pairs, no $Version/$Path (#151). */
|
||||||
|
const char *expected = "Cookie: name=value; has_js=1" H_CRLF;
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
const char *dom = "www.example.com";
|
||||||
|
int added;
|
||||||
|
|
||||||
|
cookie.max_len = (int) sizeof(cookie.data);
|
||||||
|
cookie.data[0] = '\0';
|
||||||
|
added = cookie_add(&cookie, "name", "value", dom, "/");
|
||||||
|
added |= cookie_add(&cookie, "has_js", "1", dom, "/");
|
||||||
|
/* different domain: must be filtered out */
|
||||||
|
added |= cookie_add(&cookie, "junk", "x", "other.org", "/");
|
||||||
|
if (added) {
|
||||||
|
printf("cookie-header: FAIL (cookie_add setup)\n");
|
||||||
|
htsmain_free();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
http_cookie_header_selftest(&cookie, dom, "/", hdr,
|
||||||
|
sizeof(hdr));
|
||||||
|
if (strcmp(hdr, expected) != 0)
|
||||||
|
err = 1;
|
||||||
|
if (strstr(hdr, "$Version") != NULL ||
|
||||||
|
strstr(hdr, "$Path") != NULL)
|
||||||
|
err = 1;
|
||||||
|
if (strstr(hdr, "junk") != NULL) // wrong-domain cookie leaked
|
||||||
|
err = 1;
|
||||||
|
printf("cookie-header: %s\n", err ? "FAIL" : "OK");
|
||||||
|
if (err)
|
||||||
|
printf(" got: %s\n", hdr);
|
||||||
|
htsmain_free();
|
||||||
|
return err;
|
||||||
|
} break;
|
||||||
case '!':
|
case '!':
|
||||||
HTS_PANIC_PRINTF
|
HTS_PANIC_PRINTF
|
||||||
("Option #! is disabled for security reasons");
|
("Option #! is disabled for security reasons");
|
||||||
|
|||||||
@@ -33,43 +33,43 @@ EOF
|
|||||||
else
|
else
|
||||||
GET "${url}"
|
GET "${url}"
|
||||||
fi
|
fi
|
||||||
) \
|
) |
|
||||||
| grep -E '^<!ENTITY [a-zA-Z0-9_]' \
|
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
|
||||||
| sed \
|
sed \
|
||||||
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
|
||||||
-e 's/-->$//' \
|
-e 's/-->$//' \
|
||||||
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'\
|
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
|
||||||
| ( \
|
(
|
||||||
read A
|
read -r A
|
||||||
while test -n "$A"; do
|
while test -n "$A"; do
|
||||||
ent="${A%% *}"
|
ent="${A%% *}"
|
||||||
code=$(echo "$A"|cut -f2 -d' ')
|
code=$(echo "$A" | cut -f2 -d' ')
|
||||||
# compute hash
|
# compute hash
|
||||||
hash=0
|
hash=0
|
||||||
i=0
|
i=0
|
||||||
a=1664525
|
a=1664525
|
||||||
c=1013904223
|
c=1013904223
|
||||||
m="$[1 << 32]"
|
m="$((1 << 32))"
|
||||||
while test "$i" -lt ${#ent}; do
|
while test "$i" -lt ${#ent}; do
|
||||||
d="$(echo -n "${ent:${i}:1}"|hexdump -v -e '/1 "%d"')"
|
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
|
||||||
hash="$[((${hash}*${a})%(${m})+${d}+${c})%(${m})]"
|
hash="$((((hash * a) % (m) + d + c) % (m)))"
|
||||||
i=$[${i}+1]
|
i=$((i + 1))
|
||||||
done
|
done
|
||||||
echo -e " /* $A */"
|
echo -e " /* $A */"
|
||||||
echo -e " case ${hash}u:"
|
echo -e " case ${hash}u:"
|
||||||
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
|
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
|
||||||
echo -e " return ${code};"
|
echo -e " return ${code};"
|
||||||
echo -e " }"
|
echo -e " }"
|
||||||
echo -e " break;"
|
echo -e " break;"
|
||||||
|
|
||||||
# next
|
# next
|
||||||
read A
|
read -r A
|
||||||
done
|
done
|
||||||
)
|
)
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
}
|
}
|
||||||
/* unknown */
|
/* unknown */
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
EOF
|
EOF
|
||||||
) > ${dest}
|
) >${dest}
|
||||||
|
|||||||
@@ -242,6 +242,14 @@ Please visit our Website: http://www.httrack.com
|
|||||||
#define HTS_NOPARAM "(none)"
|
#define HTS_NOPARAM "(none)"
|
||||||
#define HTS_NOPARAM2 "\"(none)\""
|
#define HTS_NOPARAM2 "\"(none)\""
|
||||||
|
|
||||||
|
/* Boolean flag for option fields and API yes/no returns. An enum (not C bool)
|
||||||
|
so it stays int-sized: option fields keep the httrackp layout/ABI, and a
|
||||||
|
return type stays compatible with the int it replaces. */
|
||||||
|
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
||||||
|
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
||||||
|
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||||
#define maximum(A,B) ( (A) > (B) ? (A) : (B) )
|
#define maximum(A,B) ( (A) > (B) ? (A) : (B) )
|
||||||
|
|
||||||
|
|||||||
295
src/htslib.c
295
src/htslib.c
@@ -644,6 +644,165 @@ T_SOC http_fopen(httrackp * opt, const char *adr, const char *fil, htsblk * reto
|
|||||||
return http_xfopen(opt, 0, 1, 1, NULL, adr, fil, retour);
|
return http_xfopen(opt, 0, 1, 1, NULL, adr, fil, retour);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read a CRLF line from a non-blocking socket (waits up to timeout per recv).
|
||||||
|
// Returns the line length (0 = empty), or -1 on timeout/EOF/error.
|
||||||
|
static int proxy_getline(T_SOC soc, char *s, int max, int timeout) {
|
||||||
|
int j = 0;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
unsigned char ch;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
if (!check_readinput_t(soc, timeout))
|
||||||
|
return -1; // timed out waiting for data
|
||||||
|
n = (int) recv(soc, &ch, 1, 0);
|
||||||
|
if (n == 1) {
|
||||||
|
if (ch == 13) // CR
|
||||||
|
continue;
|
||||||
|
if (ch == 10) // LF: end of line
|
||||||
|
break;
|
||||||
|
if (j >= max - 1)
|
||||||
|
return -1; // line too long: bound the read against a hostile proxy
|
||||||
|
s[j++] = (char) ch;
|
||||||
|
} else if (n == 0) {
|
||||||
|
return -1; // connection closed
|
||||||
|
} else {
|
||||||
|
#ifdef _WIN32
|
||||||
|
if (WSAGetLastError() == WSAEWOULDBLOCK)
|
||||||
|
continue;
|
||||||
|
#else
|
||||||
|
if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
|
||||||
|
continue;
|
||||||
|
#endif
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s[j] = '\0';
|
||||||
|
return j;
|
||||||
|
}
|
||||||
|
|
||||||
|
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||||
|
int timeout) {
|
||||||
|
const T_SOC soc = retour->soc;
|
||||||
|
const char *const host = jump_identification_const(adr); // host[:port]
|
||||||
|
const char *const portsep = jump_toport_const(adr); // ":port" or NULL
|
||||||
|
char BIGSTK authority[HTS_URLMAXSIZE * 2];
|
||||||
|
char BIGSTK req[HTS_URLMAXSIZE * 4 + 1100];
|
||||||
|
char line[1024];
|
||||||
|
int code;
|
||||||
|
|
||||||
|
if (soc == INVALID_SOCKET)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
// CONNECT needs an explicit host:port; default the https port
|
||||||
|
authority[0] = '\0';
|
||||||
|
if (portsep != NULL)
|
||||||
|
strlcatbuff(authority, host, sizeof(authority)); // already host:port
|
||||||
|
else
|
||||||
|
snprintf(authority, sizeof(authority), "%s:%d", host, 443);
|
||||||
|
|
||||||
|
// backstop: never let a stray CR/LF in the host smuggle a second line into
|
||||||
|
// the CONNECT request (the host is already sanitized upstream)
|
||||||
|
{
|
||||||
|
const char *c;
|
||||||
|
|
||||||
|
for (c = authority; *c != '\0'; c++) {
|
||||||
|
if ((unsigned char) *c < ' ') {
|
||||||
|
strcpybuff(retour->msg, "proxy CONNECT: invalid host");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(req, sizeof(req), "CONNECT %s HTTP/1.0" H_CRLF "Host: %s" H_CRLF,
|
||||||
|
authority, authority);
|
||||||
|
|
||||||
|
// creds go on the CONNECT, not the tunneled origin request
|
||||||
|
if (link_has_authorization(retour->req.proxy.name)) {
|
||||||
|
const char *a = jump_identification_const(retour->req.proxy.name);
|
||||||
|
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
||||||
|
char autorisation[1100];
|
||||||
|
char user_pass[256];
|
||||||
|
|
||||||
|
autorisation[0] = user_pass[0] = '\0';
|
||||||
|
strncatbuff(user_pass, astart, (int) (a - astart) - 1);
|
||||||
|
strcpybuff(user_pass, unescape_http(OPT_GET_BUFF(opt),
|
||||||
|
OPT_GET_BUFF_SIZE(opt), user_pass));
|
||||||
|
code64((unsigned char *) user_pass, (int) strlen(user_pass),
|
||||||
|
(unsigned char *) autorisation, 0);
|
||||||
|
strlcatbuff(req, "Proxy-Authorization: Basic ", sizeof(req));
|
||||||
|
strlcatbuff(req, autorisation, sizeof(req));
|
||||||
|
strlcatbuff(req, H_CRLF, sizeof(req));
|
||||||
|
}
|
||||||
|
strlcatbuff(req, H_CRLF, sizeof(req)); // end of request headers
|
||||||
|
|
||||||
|
// raw send: ssl is set, so sendc() would route to TLS
|
||||||
|
{
|
||||||
|
const char *p = req;
|
||||||
|
size_t remain = strlen(req);
|
||||||
|
int stalls = 0;
|
||||||
|
|
||||||
|
while (remain > 0) {
|
||||||
|
const int n = (int) send(soc, p, (int) remain, 0);
|
||||||
|
|
||||||
|
if (n > 0) {
|
||||||
|
p += n;
|
||||||
|
remain -= (size_t) n;
|
||||||
|
stalls = 0;
|
||||||
|
} else {
|
||||||
|
#ifdef _WIN32
|
||||||
|
const int wouldblock = (WSAGetLastError() == WSAEWOULDBLOCK);
|
||||||
|
#else
|
||||||
|
const int wouldblock =
|
||||||
|
(errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
|
||||||
|
#endif
|
||||||
|
// don't spin forever on a fatal error or an unwritable socket
|
||||||
|
if (!wouldblock || !check_writeinput_t(soc, timeout) ||
|
||||||
|
++stalls > 100) {
|
||||||
|
strcpybuff(retour->msg, "proxy CONNECT: write error");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// proxy status line: "HTTP/1.x <code> ..."
|
||||||
|
if (proxy_getline(soc, line, sizeof(line), timeout) < 0) {
|
||||||
|
strcpybuff(retour->msg, "proxy CONNECT: no response");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (sscanf(line, "HTTP/%*d.%*d %d", &code) < 1)
|
||||||
|
code = 0;
|
||||||
|
if (code < 200 || code >= 300) {
|
||||||
|
snprintf(retour->msg, sizeof(retour->msg), "proxy CONNECT refused: %s",
|
||||||
|
strnotempty(line) ? line : "(no status)");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// drain headers to the blank line; cap the count so a flooding proxy can't
|
||||||
|
// stall the crawl
|
||||||
|
{
|
||||||
|
int headers = 0;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
const int n = proxy_getline(soc, line, sizeof(line), timeout);
|
||||||
|
|
||||||
|
if (n < 0) {
|
||||||
|
strcpybuff(retour->msg, "proxy CONNECT: truncated response");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (n == 0)
|
||||||
|
break; // blank line: tunnel ready
|
||||||
|
if (++headers > 64) {
|
||||||
|
strcpybuff(retour->msg, "proxy CONNECT: too many response headers");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
// ouverture d'une liaison http, envoi d'une requète
|
// ouverture d'une liaison http, envoi d'une requète
|
||||||
// mode: 0 GET 1 HEAD [2 POST]
|
// mode: 0 GET 1 HEAD [2 POST]
|
||||||
// treat: traiter header?
|
// treat: traiter header?
|
||||||
@@ -680,14 +839,14 @@ T_SOC http_xfopen(httrackp * opt, int mode, int treat, int waitconnect,
|
|||||||
|
|
||||||
/* connexion */
|
/* connexion */
|
||||||
if (retour) {
|
if (retour) {
|
||||||
if ((!(retour->req.proxy.active))
|
/* no proxy, or proxy not usable here (local file) */
|
||||||
|| ((strcmp(adr, "file://") == 0)
|
if ((!(retour->req.proxy.active)) || (strcmp(adr, "file://") == 0)) {
|
||||||
|| (strncmp(adr, "https://", 8) == 0)
|
|
||||||
)
|
|
||||||
) { /* pas de proxy, ou non utilisable ici */
|
|
||||||
soc = newhttp(opt, adr, retour, -1, waitconnect);
|
soc = newhttp(opt, adr, retour, -1, waitconnect);
|
||||||
} else {
|
} else {
|
||||||
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port, waitconnect); // ouvrir sur le proxy à la place
|
// to the proxy; https tunnels to the origin via CONNECT in back_wait
|
||||||
|
// (#85)
|
||||||
|
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port,
|
||||||
|
waitconnect);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
soc = newhttp(opt, adr, NULL, -1, waitconnect);
|
soc = newhttp(opt, adr, NULL, -1, waitconnect);
|
||||||
@@ -874,6 +1033,50 @@ static void print_buffer(buff_struct*const str, const char *format, ...) {
|
|||||||
assertf(str->pos < str->capacity);
|
assertf(str->pos < str->capacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Append the request "Cookie:" header line for every stored cookie matching
|
||||||
|
domain/path. RFC 6265 form: bare "name=value" pairs joined by "; ", no
|
||||||
|
$Version/$Path attributes (those are RFC 2965 syntax that modern servers
|
||||||
|
reject, issue #151). Returns the number of cookies emitted. */
|
||||||
|
static int append_cookie_header(buff_struct *bstr, t_cookie *cookie,
|
||||||
|
const char *domain, const char *path) {
|
||||||
|
char buffer[8192];
|
||||||
|
char *b;
|
||||||
|
int cook = 0;
|
||||||
|
int max_cookies = 8;
|
||||||
|
|
||||||
|
if (cookie == NULL)
|
||||||
|
return 0;
|
||||||
|
b = cookie->data;
|
||||||
|
do {
|
||||||
|
b = cookie_find(b, "", domain, path); // next matching cookie
|
||||||
|
if (b != NULL) {
|
||||||
|
max_cookies--;
|
||||||
|
if (!cook) {
|
||||||
|
print_buffer(bstr, "Cookie: ");
|
||||||
|
cook = 1;
|
||||||
|
} else
|
||||||
|
print_buffer(bstr, "; ");
|
||||||
|
print_buffer(bstr, "%s", cookie_get(buffer, b, 5));
|
||||||
|
print_buffer(bstr, "=%s", cookie_get(buffer, b, 6));
|
||||||
|
b = cookie_nextfield(b);
|
||||||
|
}
|
||||||
|
} while (b != NULL && max_cookies > 0);
|
||||||
|
if (cook)
|
||||||
|
print_buffer(bstr, H_CRLF);
|
||||||
|
return cook;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Self-test entry for append_cookie_header(): build the request Cookie line
|
||||||
|
into dst (always NUL-terminated). Returns the number of cookies emitted. */
|
||||||
|
int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||||
|
const char *path, char *dst, size_t dst_size) {
|
||||||
|
buff_struct bstr = {dst, dst_size, 0};
|
||||||
|
|
||||||
|
assertf(dst != NULL && dst_size > 0);
|
||||||
|
dst[0] = '\0';
|
||||||
|
return append_cookie_header(&bstr, cookie, domain, path);
|
||||||
|
}
|
||||||
|
|
||||||
// envoi d'une requète
|
// envoi d'une requète
|
||||||
int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||||
const char *xsend, const char *adr, const char *fil,
|
const char *xsend, const char *adr, const char *fil,
|
||||||
@@ -999,8 +1202,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
|||||||
if (xsend)
|
if (xsend)
|
||||||
print_buffer(&bstr, "%s", xsend); // éventuelles autres lignes
|
print_buffer(&bstr, "%s", xsend); // éventuelles autres lignes
|
||||||
|
|
||||||
// tester proxy authentication
|
// for https, auth rides the CONNECT (the tunneled GET would leak it)
|
||||||
if (retour->req.proxy.active) {
|
if (retour->req.proxy.active && strncmp(adr, "https://", 8) != 0) {
|
||||||
if (link_has_authorization(retour->req.proxy.name)) { // et hop, authentification proxy!
|
if (link_has_authorization(retour->req.proxy.name)) { // et hop, authentification proxy!
|
||||||
const char *a = jump_identification_const(retour->req.proxy.name);
|
const char *a = jump_identification_const(retour->req.proxy.name);
|
||||||
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
||||||
@@ -1048,34 +1251,9 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
|||||||
search_tag + strlen(POSTTOK) + 1))));
|
search_tag + strlen(POSTTOK) + 1))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// gestion cookies?
|
// send stored cookies matching this host/path
|
||||||
if (cookie) {
|
if (cookie) {
|
||||||
char buffer[8192];
|
append_cookie_header(&bstr, cookie, jump_identification_const(adr), fil);
|
||||||
char *b = cookie->data;
|
|
||||||
int cook = 0;
|
|
||||||
int max_cookies = 8;
|
|
||||||
|
|
||||||
do {
|
|
||||||
b = cookie_find(b, "", jump_identification_const(adr), fil); // prochain cookie satisfaisant aux conditions
|
|
||||||
if (b != NULL) {
|
|
||||||
max_cookies--;
|
|
||||||
if (!cook) {
|
|
||||||
print_buffer(&bstr, "Cookie: $Version=1; ");
|
|
||||||
cook = 1;
|
|
||||||
} else
|
|
||||||
print_buffer(&bstr, "; ");
|
|
||||||
print_buffer(&bstr, "%s", cookie_get(buffer, b, 5));
|
|
||||||
print_buffer(&bstr, "=%s", cookie_get(buffer, b, 6));
|
|
||||||
print_buffer(&bstr, "; $Path=%s", cookie_get(buffer, b, 2));
|
|
||||||
b = cookie_nextfield(b);
|
|
||||||
}
|
|
||||||
} while(b != NULL && max_cookies > 0);
|
|
||||||
if (cook) { // on a envoyé un (ou plusieurs) cookie?
|
|
||||||
print_buffer(&bstr, H_CRLF);
|
|
||||||
#if DEBUG_COOK
|
|
||||||
printf("Header:\n%s\n", bstr.buffer);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// gérer le keep-alive (garder socket)
|
// gérer le keep-alive (garder socket)
|
||||||
if (retour->req.http11 && !retour->req.nokeepalive) {
|
if (retour->req.http11 && !retour->req.nokeepalive) {
|
||||||
@@ -1808,6 +1986,24 @@ int check_readinput_t(T_SOC soc, int timeout) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// wait until the socket is writable, up to timeout seconds
|
||||||
|
int check_writeinput_t(T_SOC soc, int timeout) {
|
||||||
|
if (soc != INVALID_SOCKET) {
|
||||||
|
fd_set fds;
|
||||||
|
struct timeval tv;
|
||||||
|
const int isoc = (int) soc;
|
||||||
|
|
||||||
|
assertf(isoc == soc);
|
||||||
|
FD_ZERO(&fds);
|
||||||
|
FD_SET(isoc, &fds);
|
||||||
|
tv.tv_sec = timeout;
|
||||||
|
tv.tv_usec = 0;
|
||||||
|
select(isoc + 1, NULL, &fds, NULL, &tv);
|
||||||
|
return FD_ISSET(isoc, &fds) ? 1 : 0;
|
||||||
|
} else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
// idem, sauf qu'ici on peut choisir la taille max de données à recevoir
|
// idem, sauf qu'ici on peut choisir la taille max de données à recevoir
|
||||||
// SI bufl==0 alors le buffer est censé être de 8kos, et on recoit par bloc de lignes
|
// SI bufl==0 alors le buffer est censé être de 8kos, et on recoit par bloc de lignes
|
||||||
// en éliminant les cr (ex: header), arrêt si double-lf
|
// en éliminant les cr (ex: header), arrêt si double-lf
|
||||||
@@ -2409,6 +2605,8 @@ int ident_url_absolute(const char *url, lien_adrfil *adrfil) {
|
|||||||
for(i = 0; adrfil->fil[i] != '\0'; i++)
|
for(i = 0; adrfil->fil[i] != '\0'; i++)
|
||||||
if (adrfil->fil[i] == '\\')
|
if (adrfil->fil[i] == '\\')
|
||||||
adrfil->fil[i] = '/';
|
adrfil->fil[i] = '/';
|
||||||
|
// collapse ../ like the http branch above (path-traversal safety)
|
||||||
|
fil_simplifie(adrfil->fil);
|
||||||
}
|
}
|
||||||
|
|
||||||
// no hostname
|
// no hostname
|
||||||
@@ -3646,8 +3844,9 @@ HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const cha
|
|||||||
// DOES NOT DECODE %25 (part of CHAR_DELIM)
|
// DOES NOT DECODE %25 (part of CHAR_DELIM)
|
||||||
// no_high & 1: decode high chars
|
// no_high & 1: decode high chars
|
||||||
// no_high & 2: decode space
|
// no_high & 2: decode space
|
||||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||||
const char *s, const int no_high) {
|
const char *s,
|
||||||
|
const hts_boolean no_high) {
|
||||||
size_t i, j;
|
size_t i, j;
|
||||||
|
|
||||||
RUNTIME_TIME_CHECK_SIZE(size);
|
RUNTIME_TIME_CHECK_SIZE(size);
|
||||||
@@ -3931,8 +4130,8 @@ void hts_replace(char *s, char from, char to) {
|
|||||||
|
|
||||||
// guess a local file's mime type (e.g. fil="toto.gif" -> s="image/gif")
|
// guess a local file's mime type (e.g. fil="toto.gif" -> s="image/gif")
|
||||||
// returns 1 if a type was written to s, 0 otherwise
|
// returns 1 if a type was written to s, 0 otherwise
|
||||||
int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
hts_boolean guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||||
const char *fil) {
|
const char *fil) {
|
||||||
return get_httptype_sized(opt, s, ssize, fil, 1);
|
return get_httptype_sized(opt, s, ssize, fil, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3945,8 +4144,8 @@ void guess_httptype(httrackp * opt, char *s, const char *fil) {
|
|||||||
// write the mime type for fil into s (capacity ssize)
|
// write the mime type for fil into s (capacity ssize)
|
||||||
// flag: 1 to always return a type (the "application/..." / octet-stream
|
// flag: 1 to always return a type (the "application/..." / octet-stream
|
||||||
// fallback) returns 1 if a type was written to s, 0 otherwise
|
// fallback) returns 1 if a type was written to s, 0 otherwise
|
||||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||||
const char *fil, int flag) {
|
const char *fil, hts_boolean flag) {
|
||||||
// userdef overrides get_httptype (a rule with an empty value, e.g. "--assume
|
// userdef overrides get_httptype (a rule with an empty value, e.g. "--assume
|
||||||
// cgi=", matches but writes nothing: report it as "no type" like the old
|
// cgi=", matches but writes nothing: report it as "no type" like the old
|
||||||
// code, whose callers tested strnotempty(s))
|
// code, whose callers tested strnotempty(s))
|
||||||
@@ -4196,7 +4395,7 @@ HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil) {
|
|||||||
|
|
||||||
// page dynamique?
|
// page dynamique?
|
||||||
// is_dyntype(get_ext("foo.asp"))
|
// is_dyntype(get_ext("foo.asp"))
|
||||||
HTSEXT_API int is_dyntype(const char *fil) {
|
HTSEXT_API hts_boolean is_dyntype(const char *fil) {
|
||||||
int j = 0;
|
int j = 0;
|
||||||
|
|
||||||
if (!fil)
|
if (!fil)
|
||||||
@@ -4214,7 +4413,7 @@ HTSEXT_API int is_dyntype(const char *fil) {
|
|||||||
|
|
||||||
// types critiques qui ne doivent pas être changés car renvoyés par des serveurs qui ne
|
// types critiques qui ne doivent pas être changés car renvoyés par des serveurs qui ne
|
||||||
// connaissent pas le type
|
// connaissent pas le type
|
||||||
int may_unknown(httrackp * opt, const char *st) {
|
hts_boolean may_unknown(httrackp *opt, const char *st) {
|
||||||
int j = 0;
|
int j = 0;
|
||||||
|
|
||||||
// types média
|
// types média
|
||||||
@@ -5236,7 +5435,8 @@ HTSEXT_API int hts_uninit_module(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// legacy. do not use
|
// legacy. do not use
|
||||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg) {
|
HTSEXT_API hts_boolean hts_log(httrackp *opt, const char *prefix,
|
||||||
|
const char *msg) {
|
||||||
if (opt->log != NULL) {
|
if (opt->log != NULL) {
|
||||||
fspc(opt, opt->log, prefix);
|
fspc(opt, opt->log, prefix);
|
||||||
fprintf(opt->log, "%s" LF, msg);
|
fprintf(opt->log, "%s" LF, msg);
|
||||||
@@ -5466,9 +5666,10 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
|||||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||||
StringCopy(opt->referer, "");
|
StringCopy(opt->referer, "");
|
||||||
StringCopy(opt->from, "");
|
StringCopy(opt->from, "");
|
||||||
opt->savename_83 = 0; // noms longs par défaut
|
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||||
opt->savename_type = 0; // avec structure originale
|
opt->savename_type = 0; // avec structure originale
|
||||||
opt->savename_delayed = 2; // hard delayed type (default)
|
opt->savename_delayed =
|
||||||
|
HTS_SAVENAME_DELAYED_HARD; // always delay the type check (default)
|
||||||
opt->delayed_cached = HTS_TRUE;
|
opt->delayed_cached = HTS_TRUE;
|
||||||
opt->mimehtml = HTS_FALSE;
|
opt->mimehtml = HTS_FALSE;
|
||||||
opt->parsejava = HTSPARSE_DEFAULT; // parser classes
|
opt->parsejava = HTSPARSE_DEFAULT; // parser classes
|
||||||
@@ -5493,7 +5694,7 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
|||||||
opt->parseall = HTS_TRUE;
|
opt->parseall = HTS_TRUE;
|
||||||
opt->parsedebug = HTS_FALSE;
|
opt->parsedebug = HTS_FALSE;
|
||||||
opt->norecatch = HTS_FALSE;
|
opt->norecatch = HTS_FALSE;
|
||||||
opt->verbosedisplay = 0; // pas d'animation texte
|
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
|
||||||
opt->sizehack = HTS_FALSE;
|
opt->sizehack = HTS_FALSE;
|
||||||
opt->urlhack = HTS_TRUE;
|
opt->urlhack = HTS_TRUE;
|
||||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||||
|
|||||||
16
src/htslib.h
16
src/htslib.h
@@ -182,6 +182,11 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode, const char *xsend
|
|||||||
const char *adr, const char *fil,
|
const char *adr, const char *fil,
|
||||||
const char *referer_adr, const char *referer_fil,
|
const char *referer_adr, const char *referer_fil,
|
||||||
htsblk * retour);
|
htsblk * retour);
|
||||||
|
/* Build the request "Cookie:" header line for stored cookies matching
|
||||||
|
domain/path into dst (NUL-terminated). Exposed for the -#Q self-test;
|
||||||
|
wraps the same logic http_sendhead() uses. Returns cookies emitted. */
|
||||||
|
int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||||
|
const char *path, char *dst, size_t dst_size);
|
||||||
|
|
||||||
//int newhttp(char* iadr,char* err=NULL);
|
//int newhttp(char* iadr,char* err=NULL);
|
||||||
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
||||||
@@ -193,6 +198,17 @@ HTS_INLINE void deletesoc_r(htsblk * r);
|
|||||||
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
||||||
int check_readinput(htsblk * r);
|
int check_readinput(htsblk * r);
|
||||||
int check_readinput_t(T_SOC soc, int timeout);
|
int check_readinput_t(T_SOC soc, int timeout);
|
||||||
|
int check_writeinput_t(T_SOC soc, int timeout);
|
||||||
|
|
||||||
|
/* Open an HTTP CONNECT tunnel through the active proxy for an https request:
|
||||||
|
`retour->soc` must already be TCP-connected to the proxy, and `adr` is the
|
||||||
|
origin authority (url_adr, e.g. "https://host:port"). Sends the CONNECT
|
||||||
|
request (with Proxy-Authorization when the proxy carries credentials) and
|
||||||
|
reads the proxy's status line, so the caller's TLS handshake then runs
|
||||||
|
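end-to-end with the origin. `timeout` (seconds) bounds each socket wait, not
|
||||||
|
the whole exchange. Returns 1 on a 2xx tunnel, 0 on failure (retour->msg is
|
||||||
|
set, except when the socket is already invalid; statuscode is untouched). */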
|
||||||
|
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||||
|
int timeout);
|
||||||
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
||||||
char *rcvd);
|
char *rcvd);
|
||||||
void treatfirstline(htsblk * retour, const char *rcvd);
|
void treatfirstline(htsblk * retour, const char *rcvd);
|
||||||
|
|||||||
@@ -184,10 +184,11 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
|
|
||||||
/* 8-3 ? */
|
/* 8-3 ? */
|
||||||
switch (opt->savename_83) {
|
switch (opt->savename_83) {
|
||||||
case 1: // 8-3
|
case HTS_SAVENAME_83_DOS: // 8-3
|
||||||
max_char = 8;
|
max_char = 8;
|
||||||
break;
|
break;
|
||||||
case 2: // Level 2 File names may be up to 31 characters.
|
case HTS_SAVENAME_83_ISO9660: // Level 2 File names may be up to 31
|
||||||
|
// characters.
|
||||||
max_char = 31;
|
max_char = 31;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@@ -324,7 +325,7 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* replace shtml to html.. */
|
/* replace shtml to html.. */
|
||||||
if (opt->savename_delayed == 2)
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD)
|
||||||
is_html = -1; /* ALWAYS delay type */
|
is_html = -1; /* ALWAYS delay type */
|
||||||
else
|
else
|
||||||
is_html = ishtml(opt, fil);
|
is_html = ishtml(opt, fil);
|
||||||
@@ -363,7 +364,9 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
) {
|
) {
|
||||||
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
||||||
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
||||||
if (opt->savename_delayed == 2 || (ishtest = ishtml(opt, fil)) < 0) { // on ne sait pas si c'est un html ou un fichier..
|
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||||
|
(ishtest = ishtml(opt, fil)) <
|
||||||
|
0) { // unsure whether it's html or a file
|
||||||
// lire dans le cache
|
// lire dans le cache
|
||||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||||
|
|
||||||
@@ -393,11 +396,12 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
//
|
//
|
||||||
} else if (opt->savename_delayed != 2 && is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||||
Lookup mimetype not only by extension,
|
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||||
but also by filename */
|
Lookup mimetype not only by extension,
|
||||||
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type,
|
but also by filename */
|
||||||
that is, ".html" */
|
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the
|
||||||
|
text/html MIME file type, that is, ".html" */
|
||||||
char BIGSTK mime[1024];
|
char BIGSTK mime[1024];
|
||||||
|
|
||||||
mime[0] = ext[0] = '\0';
|
mime[0] = ext[0] = '\0';
|
||||||
@@ -408,9 +412,13 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// note: if savename_delayed is enabled, the naming will be temporary (and slightly invalid!)
|
// note: if savename_delayed is enabled, the naming will be temporary
|
||||||
// note: if we are about to stop (opt->state.stop), back_add() will fail later
|
// (and slightly invalid!)
|
||||||
else if (opt->savename_delayed != 0 && !opt->state.stop) {
|
//
|
||||||
|
// note: if we are about to stop (opt->state.stop), back_add() will
|
||||||
|
// fail later
|
||||||
|
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||||
|
!opt->state.stop) {
|
||||||
// Check if the file is ready in backing. We basically take the same logic as later.
|
// Check if the file is ready in backing. We basically take the same logic as later.
|
||||||
// FIXME: we should cleanup and factorize this unholy mess
|
// FIXME: we should cleanup and factorize this unholy mess
|
||||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||||
@@ -698,7 +706,7 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
}
|
}
|
||||||
// restaurer
|
// restaurer
|
||||||
opt->state._hts_in_html_parsing = hihp;
|
opt->state._hts_in_html_parsing = hihp;
|
||||||
} // caché?
|
} // caché?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1190,7 +1198,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
// Not used anymore unless non-delayed types.
|
// Not used anymore unless non-delayed types.
|
||||||
// de même en cas de manque d'extension on en place une de manière forcée..
|
// de même en cas de manque d'extension on en place une de manière forcée..
|
||||||
// cela évite les /chez/toto et les /chez/toto/index.html incompatibles
|
// cela évite les /chez/toto et les /chez/toto/index.html incompatibles
|
||||||
if (opt->savename_type != -1 && opt->savename_delayed != 2) {
|
if (opt->savename_type != -1 &&
|
||||||
|
opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD) {
|
||||||
char *a = afs->save + strlen(afs->save) - 1;
|
char *a = afs->save + strlen(afs->save) - 1;
|
||||||
|
|
||||||
while((a > afs->save) && (*a != '.') && (*a != '/'))
|
while((a > afs->save) && (*a != '.') && (*a != '/'))
|
||||||
@@ -1236,31 +1245,21 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
size_t i;
|
size_t i;
|
||||||
for(i = 0 ; afs->save[i] != '\0' ; i++) {
|
for(i = 0 ; afs->save[i] != '\0' ; i++) {
|
||||||
unsigned char c = (unsigned char) afs->save[i];
|
unsigned char c = (unsigned char) afs->save[i];
|
||||||
if (c < 32 // control
|
if (c < 32 // control
|
||||||
|| c == 127 // unwise
|
|| c == 127 // unwise
|
||||||
|| c == '~' // unix unwise
|
|| c == '~' // unix unwise
|
||||||
|| c == '\\' // windows separator
|
|| c == '\\' // windows separator
|
||||||
|| c == ':' // windows forbidden
|
|| c == ':' // windows forbidden
|
||||||
|| c == '*' // windows forbidden
|
|| c == '*' // windows forbidden
|
||||||
|| c == '?' // windows forbidden
|
|| c == '?' // windows forbidden
|
||||||
|| c == '\"' // windows forbidden
|
|| c == '\"' // windows forbidden
|
||||||
|| c == '<' // windows forbidden
|
|| c == '<' // windows forbidden
|
||||||
|| c == '>' // windows forbidden
|
|| c == '>' // windows forbidden
|
||||||
|| c == '|' // windows forbidden
|
|| c == '|' // windows forbidden
|
||||||
//|| c == '@' // ?
|
//|| c == '@' // ?
|
||||||
||
|
|| (opt->savename_83 == HTS_SAVENAME_83_ISO9660 // CDROM
|
||||||
(
|
&& (c == '-' || c == '=' || c == '+'))) {
|
||||||
opt->savename_83 == 2 // CDROM
|
afs->save[i] = '_';
|
||||||
&&
|
|
||||||
(
|
|
||||||
c == '-'
|
|
||||||
|| c == '='
|
|
||||||
|| c == '+'
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
{
|
|
||||||
afs->save[i] = '_';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1521,7 +1520,8 @@ int url_savename(lien_adrfilsave *const afs,
|
|||||||
char *a = afs->save + strlen(afs->save) - 1;
|
char *a = afs->save + strlen(afs->save) - 1;
|
||||||
char *b;
|
char *b;
|
||||||
int n = 2;
|
int n = 2;
|
||||||
char collisionSeparator = ((opt->savename_83 != 2) ? '-' : '_');
|
char collisionSeparator =
|
||||||
|
((opt->savename_83 != HTS_SAVENAME_83_ISO9660) ? '-' : '_');
|
||||||
|
|
||||||
tempo[0] = '\0';
|
tempo[0] = '\0';
|
||||||
|
|
||||||
|
|||||||
53
src/htsopt.h
53
src/htsopt.h
@@ -342,24 +342,44 @@ typedef enum hts_seeker {
|
|||||||
HTS_SEEKER_UP = 1 << 1 /**< may ascend to parent directories */
|
HTS_SEEKER_UP = 1 << 1 /**< may ascend to parent directories */
|
||||||
} hts_seeker;
|
} hts_seeker;
|
||||||
|
|
||||||
/* Link-following scope, stored in the low byte of opt->travel. */
|
/* opt->travel: link-following scope in the low byte, flags OR'd in above it. */
|
||||||
typedef enum hts_travel_scope {
|
typedef enum hts_travel_scope {
|
||||||
HTS_TRAVEL_SAME_ADDRESS = 0, /**< stay on the same address (host) */
|
HTS_TRAVEL_SAME_ADDRESS = 0, /**< stay on the same address (host) */
|
||||||
HTS_TRAVEL_SAME_DOMAIN = 1, /**< stay on the same principal domain */
|
HTS_TRAVEL_SAME_DOMAIN = 1, /**< stay on the same principal domain */
|
||||||
HTS_TRAVEL_SAME_TLD = 2, /**< stay on the same TLD (e.g. .com) */
|
HTS_TRAVEL_SAME_TLD = 2, /**< stay on the same TLD (e.g. .com) */
|
||||||
HTS_TRAVEL_EVERYWHERE = 7 /**< follow links anywhere on the web */
|
HTS_TRAVEL_EVERYWHERE = 7, /**< follow links anywhere on the web */
|
||||||
|
HTS_TRAVEL_TEST_ALL = 1 << 8 /**< also test forbidden URLs (-t) */
|
||||||
} hts_travel_scope;
|
} hts_travel_scope;
|
||||||
|
|
||||||
/* Flags OR'd into opt->travel above the scope value. */
|
/* Mask selecting the scope value out of opt->travel. */
|
||||||
#define HTS_TRAVEL_SCOPE_MASK 0xff /**< mask selecting the scope value */
|
#define HTS_TRAVEL_SCOPE_MASK 0xff
|
||||||
#define HTS_TRAVEL_TEST_ALL (1 << 8) /**< also test forbidden URLs (-t) */
|
|
||||||
|
|
||||||
/* Boolean option flag. An enum (not C bool) so the option fields stay int-sized
|
/* Text progress display detail (opt->verbosedisplay). */
|
||||||
and the httrackp layout/ABI is unchanged. */
|
typedef enum hts_verbosedisplay {
|
||||||
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
HTS_VERBOSE_NONE = 0, /**< no animated progress display (default) */
|
||||||
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
HTS_VERBOSE_SIMPLE = 1, /**< minimal single-line progress */
|
||||||
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
HTS_VERBOSE_FULL = 2 /**< full animated progress */
|
||||||
#endif
|
} hts_verbosedisplay;
|
||||||
|
|
||||||
|
/* Values of opt->savename_delayed: when the file type check is delayed. */
typedef enum hts_savename_delayed {
  /** resolve the type immediately */
  HTS_SAVENAME_DELAYED_NONE = 0,
  /** delay the type check when unknown */
  HTS_SAVENAME_DELAYED_SOFT = 1,
  /** always delay the type check (default) */
  HTS_SAVENAME_DELAYED_HARD = 2
} hts_savename_delayed;
|
||||||
|
|
||||||
|
/* Values of opt->savename_83: length layout of saved file names. */
typedef enum hts_savename_83 {
  /** long file names (default) */
  HTS_SAVENAME_83_LONG = 0,
  /** DOS 8.3 names (ISO9660 level 1) */
  HTS_SAVENAME_83_DOS = 1,
  /** ISO9660 level 2 names (up to 31 chars) */
  HTS_SAVENAME_83_ISO9660 = 2
} hts_savename_83;
|
||||||
|
|
||||||
|
/* Bit flags for opt->hostcontrol: conditions under which a host is banned.
   The values are distinct bits, so they can be combined with bitwise OR. */
typedef enum hts_hostcontrol {
  /** ban a timing-out host */
  HTS_HOSTCONTROL_BAN_TIMEOUT = 0x01,
  /** ban a too-slow host */
  HTS_HOSTCONTROL_BAN_SLOW = 0x02
} hts_hostcontrol;
|
||||||
|
|
||||||
#ifndef HTS_DEF_FWSTRUCT_lien_buffers
|
#ifndef HTS_DEF_FWSTRUCT_lien_buffers
|
||||||
#define HTS_DEF_FWSTRUCT_lien_buffers
|
#define HTS_DEF_FWSTRUCT_lien_buffers
|
||||||
@@ -393,7 +413,7 @@ struct httrackp {
|
|||||||
hts_urlmode
|
hts_urlmode
|
||||||
urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
||||||
hts_boolean no_type_change; // do not change file type according to MIME
|
hts_boolean no_type_change; // do not change file type according to MIME
|
||||||
int debug; /**< debug logging level */
|
hts_log_type debug; /**< debug logging level */
|
||||||
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
||||||
FILE *log; /**< informational log stream; NULL mutes it */
|
FILE *log; /**< informational log stream; NULL mutes it */
|
||||||
FILE *errlog; /**< error log stream; NULL mutes it */
|
FILE *errlog; /**< error log stream; NULL mutes it */
|
||||||
@@ -417,11 +437,12 @@ struct httrackp {
|
|||||||
// int aff_progress; // progress bar
|
// int aff_progress; // progress bar
|
||||||
hts_boolean shell; /**< driven by a shell over stdin/stdout pipes */
|
hts_boolean shell; /**< driven by a shell over stdin/stdout pipes */
|
||||||
t_proxy proxy; /**< proxy configuration */
|
t_proxy proxy; /**< proxy configuration */
|
||||||
int savename_83; /**< force 8.3 (DOS) file names */
|
hts_savename_83
|
||||||
|
savename_83; /**< saved-name length layout (long/DOS/ISO9660) */
|
||||||
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
||||||
String
|
String
|
||||||
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
||||||
int savename_delayed; // delayed type check
|
hts_savename_delayed savename_delayed; /**< delayed type-check policy */
|
||||||
hts_boolean
|
hts_boolean
|
||||||
delayed_cached; // delayed type check can be cached to speedup updates
|
delayed_cached; // delayed type check can be cached to speedup updates
|
||||||
hts_boolean mimehtml; /**< produce a single MIME/MHTML archive */
|
hts_boolean mimehtml; /**< produce a single MIME/MHTML archive */
|
||||||
@@ -437,7 +458,7 @@ struct httrackp {
|
|||||||
hts_boolean makestat; /**< maintain a transfer-statistics log */
|
hts_boolean makestat; /**< maintain a transfer-statistics log */
|
||||||
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
||||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||||
int hostcontrol; /**< drop hosts that are too slow, etc. */
|
int hostcontrol; /**< ban slow/timing-out hosts; see hts_hostcontrol bits */
|
||||||
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
||||||
hts_boolean
|
hts_boolean
|
||||||
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||||
@@ -462,7 +483,7 @@ struct httrackp {
|
|||||||
parseall; /**< parse aggressively, including unknown tags with links */
|
parseall; /**< parse aggressively, including unknown tags with links */
|
||||||
hts_boolean parsedebug; /**< parser debug mode */
|
hts_boolean parsedebug; /**< parser debug mode */
|
||||||
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
||||||
int verbosedisplay; /**< animated text progress display */
|
hts_verbosedisplay verbosedisplay; /**< animated text progress display */
|
||||||
String footer; /**< footer/info line injected into pages */
|
String footer; /**< footer/info line injected into pages */
|
||||||
int maxcache; /**< in-memory cache backing limit (bytes) */
|
int maxcache; /**< in-memory cache backing limit (bytes) */
|
||||||
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
||||||
|
|||||||
128
src/htsparse.c
128
src/htsparse.c
@@ -296,6 +296,48 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
|||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||||
|
would underflow; space reads as the word boundary the guards want there. */
|
||||||
|
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||||
|
return html > start ? html[-1] : ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tell whether the len bytes starting at s spell exactly one HTTP method
   name, ignoring case. Used because the first argument of XHR.open() is a
   method, not a URL (#218).
   Returns 1 on a match, 0 otherwise. s does not need to be NUL-terminated at
   s+len: a candidate is only compared when its own length equals len. */
static int is_http_method(const char *s, size_t len) {
  static const char *const known[] = {"GET",  "POST",    "PUT",   "DELETE",
                                      "HEAD", "OPTIONS", "PATCH", "TRACE"};
  const size_t count = sizeof(known) / sizeof(known[0]);
  size_t k;

  for (k = 0; k < count; k++) {
    const char *const name = known[k];

    /* Different length: cannot be an exact match. */
    if (strlen(name) != len)
      continue;
    /* strfield() is compared against len here, i.e. it is expected to
       return the number of bytes matched when s starts with name. */
    if (strfield(s, name) == (int) len)
      return 1;
  }
  return 0;
}
|
||||||
|
|
||||||
|
/* Percent-encode '(' and ')' in a link emitted into an unquoted url(...) (CSS
|
||||||
|
or JS): a literal ')' closes the token early and the UA mis-parses the value
|
||||||
|
(#163). The UA decodes %28/%29 back to the saved-on-disk name. */
|
||||||
|
static void escape_url_parens(char *const s, const size_t size) {
|
||||||
|
char BIGSTK buff[HTS_URLMAXSIZE * 2];
|
||||||
|
size_t i, j;
|
||||||
|
|
||||||
|
for (i = 0, j = 0; s[i] != '\0' && j + 3 < size && j + 3 < sizeof(buff);
|
||||||
|
i++) {
|
||||||
|
if (s[i] == '(' || s[i] == ')') {
|
||||||
|
buff[j++] = '%';
|
||||||
|
buff[j++] = '2';
|
||||||
|
buff[j++] = s[i] == '(' ? '8' : '9';
|
||||||
|
} else {
|
||||||
|
buff[j++] = s[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buff[j] = '\0';
|
||||||
|
strlcpybuff(s, buff, size);
|
||||||
|
}
|
||||||
|
|
||||||
/* Main parser */
|
/* Main parser */
|
||||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||||
char catbuff[CATBUFF_SIZE];
|
char catbuff[CATBUFF_SIZE];
|
||||||
@@ -556,7 +598,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||||
p = strfield(html, "title");
|
p = strfield(html, "title");
|
||||||
if (p) {
|
if (p) {
|
||||||
if (*(html - 1) == '/')
|
if (html_prevc(html, r->adr) == '/')
|
||||||
p = 0; // /title
|
p = 0; // /title
|
||||||
} else {
|
} else {
|
||||||
if (strfield(html, "/html"))
|
if (strfield(html, "/html"))
|
||||||
@@ -1341,6 +1383,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
int can_avoid_quotes = 0;
|
int can_avoid_quotes = 0;
|
||||||
char quotes_replacement = '\0';
|
char quotes_replacement = '\0';
|
||||||
int ensure_not_mime = 0;
|
int ensure_not_mime = 0;
|
||||||
|
// .open(method,url): reject an HTTP-method first arg (#218)
|
||||||
|
int ensure_not_method = 0;
|
||||||
|
// @import: the quoted token is the URL; a trailing
|
||||||
|
// media/supports/layer condition is not part of it
|
||||||
|
int is_import = 0;
|
||||||
|
|
||||||
if (inscript_tag)
|
if (inscript_tag)
|
||||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||||
@@ -1357,9 +1404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if (!nc)
|
if (!nc)
|
||||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||||
if (!nc) { // location="doc"
|
if (!nc) { // location="doc"
|
||||||
if ((nc = strfield(html, "location"))
|
if ((nc = strfield(html, "location")) &&
|
||||||
&& !isspace(*(html - 1))
|
!isspace(html_prevc(html, r->adr)))
|
||||||
)
|
|
||||||
nc = 0;
|
nc = 0;
|
||||||
}
|
}
|
||||||
if (!nc)
|
if (!nc)
|
||||||
@@ -1369,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = "),"; // fin: virgule ou parenthèse
|
expected_end = "),"; // fin: virgule ou parenthèse
|
||||||
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
||||||
|
ensure_not_method = 1; // xhr.open: don't grab method
|
||||||
}
|
}
|
||||||
if (!nc)
|
if (!nc)
|
||||||
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
||||||
@@ -1380,7 +1427,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = ")"; // fin: parenthèse
|
expected_end = ")"; // fin: parenthèse
|
||||||
}
|
}
|
||||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
if (!nc && (nc = strfield(html, "url")) &&
|
||||||
|
(!isalnum(html_prevc(html, r->adr))) &&
|
||||||
|
html_prevc(html, r->adr) != '_') { // url(url)
|
||||||
expected = '('; // parenthèse
|
expected = '('; // parenthèse
|
||||||
expected_end = ")"; // fin: parenthèse
|
expected_end = ")"; // fin: parenthèse
|
||||||
can_avoid_quotes = 1;
|
can_avoid_quotes = 1;
|
||||||
@@ -1390,6 +1439,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((nc = strfield(html, "import"))) { // import "url"
|
if ((nc = strfield(html, "import"))) { // import "url"
|
||||||
if (is_space(*(html + nc))) {
|
if (is_space(*(html + nc))) {
|
||||||
expected = 0; // no char expected
|
expected = 0; // no char expected
|
||||||
|
is_import = 1;
|
||||||
} else
|
} else
|
||||||
nc = 0;
|
nc = 0;
|
||||||
}
|
}
|
||||||
@@ -1407,6 +1457,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||||
const char *b, *c;
|
const char *b, *c;
|
||||||
int ndelim = 1;
|
int ndelim = 1;
|
||||||
|
int valid_url = 0;
|
||||||
|
|
||||||
if ((*a == 34) || (*a == '\''))
|
if ((*a == 34) || (*a == '\''))
|
||||||
a++;
|
a++;
|
||||||
@@ -1421,12 +1472,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
c = b--;
|
c = b--;
|
||||||
c += ndelim;
|
// no closing delimiter here (truncated input):
|
||||||
while(*c == ' ')
|
// Don't scan past the buffer NUL or capture it.
|
||||||
c++;
|
if (*c != '\0') {
|
||||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
c += ndelim;
|
||||||
|| (*c == '\r')) {
|
while (*c == ' ')
|
||||||
c -= (ndelim + 1);
|
c++;
|
||||||
|
valid_url =
|
||||||
|
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||||
|
(*c == '\r') ||
|
||||||
|
(is_import && *(b + 1 + ndelim) == ' ');
|
||||||
|
}
|
||||||
|
if (valid_url) {
|
||||||
|
// URL end = last char (b), not the delimiter
|
||||||
|
c = b;
|
||||||
if ((int) (c - a + 1)) {
|
if ((int) (c - a + 1)) {
|
||||||
if (ensure_not_mime) {
|
if (ensure_not_mime) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@@ -1442,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// XHR.open's "GET" etc. is a method, not a URL
|
||||||
|
if (a != NULL && ensure_not_method &&
|
||||||
|
is_http_method(a, (size_t) (c - a + 1))) {
|
||||||
|
a = NULL;
|
||||||
|
}
|
||||||
// Check for bogus links (Vasiliy)
|
// Check for bogus links (Vasiliy)
|
||||||
if (a != NULL) {
|
if (a != NULL) {
|
||||||
const size_t size = c - a + 1;
|
const size_t size = c - a + 1;
|
||||||
@@ -1485,7 +1549,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1692,6 +1755,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
hts_nodetect[i -
|
hts_nodetect[i -
|
||||||
1]);
|
1]);
|
||||||
}
|
}
|
||||||
|
// xmlns / xmlns:prefix declare
|
||||||
|
// XML namespaces, not resources
|
||||||
|
// (#191)
|
||||||
|
else {
|
||||||
|
const int xl = strfield(
|
||||||
|
intag_startattr, "xmlns");
|
||||||
|
const char xc =
|
||||||
|
intag_startattr[xl];
|
||||||
|
if (xl &&
|
||||||
|
(xc == ':' || xc == '=' ||
|
||||||
|
is_space(xc))) {
|
||||||
|
url_ok = 0;
|
||||||
|
hts_log_print(
|
||||||
|
opt, LOG_DEBUG,
|
||||||
|
"dirty parsing: xmlns "
|
||||||
|
"namespace avoided");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2967,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
|||||||
/* Never escape high-chars (we don't know the encoding!!) */
|
/* Never escape high-chars (we don't know the encoding!!) */
|
||||||
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
||||||
|
|
||||||
|
// unquoted url() (CSS/JS): keep parens escaped
|
||||||
|
if (ending_p == ')')
|
||||||
|
escape_url_parens(tempo, sizeof(tempo));
|
||||||
|
|
||||||
//if (!no_esc_utf)
|
//if (!no_esc_utf)
|
||||||
// escape_uri(tempo); // escape with %xx
|
// escape_uri(tempo); // escape with %xx
|
||||||
//else {
|
//else {
|
||||||
@@ -3722,7 +3807,8 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
//case -1: can_retry=1; break;
|
//case -1: can_retry=1; break;
|
||||||
case STATUSCODE_TIMEOUT:
|
case STATUSCODE_TIMEOUT:
|
||||||
if (opt->hostcontrol) { // timeout et retry épuisés
|
if (opt->hostcontrol) { // timeout et retry épuisés
|
||||||
if ((opt->hostcontrol & 1) && (heap(ptr)->retry <= 0)) {
|
if ((opt->hostcontrol & HTS_HOSTCONTROL_BAN_TIMEOUT) &&
|
||||||
|
(heap(ptr)->retry <= 0)) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||||
hts_log_print(opt, LOG_DEBUG,
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
@@ -3735,7 +3821,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
|||||||
break;
|
break;
|
||||||
case STATUSCODE_SLOW:
|
case STATUSCODE_SLOW:
|
||||||
if ((opt->hostcontrol) && (heap(ptr)->retry <= 0)) { // too slow
|
if ((opt->hostcontrol) && (heap(ptr)->retry <= 0)) { // too slow
|
||||||
if (opt->hostcontrol & 2) {
|
if (opt->hostcontrol & HTS_HOSTCONTROL_BAN_SLOW) {
|
||||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||||
hts_log_print(opt, LOG_DEBUG,
|
hts_log_print(opt, LOG_DEBUG,
|
||||||
@@ -4261,10 +4347,10 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
|||||||
char com[256];
|
char com[256];
|
||||||
|
|
||||||
linput(stdin, com, 200);
|
linput(stdin, com, 200);
|
||||||
if (opt->verbosedisplay == 2)
|
if (opt->verbosedisplay == HTS_VERBOSE_FULL)
|
||||||
opt->verbosedisplay = 1;
|
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||||
else
|
else
|
||||||
opt->verbosedisplay = 2;
|
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||||
/* Info for wrappers */
|
/* Info for wrappers */
|
||||||
hts_log_print(opt, LOG_INFO, "engine: change-options");
|
hts_log_print(opt, LOG_INFO, "engine: change-options");
|
||||||
RUN_CALLBACK0(opt, chopt);
|
RUN_CALLBACK0(opt, chopt);
|
||||||
@@ -4374,7 +4460,7 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
|||||||
printf("%c\x0d", ("/-\\|")[roll]);
|
printf("%c\x0d", ("/-\\|")[roll]);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
} else if (opt->verbosedisplay == 1) {
|
} else if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||||
if (b >= 0) {
|
if (b >= 0) {
|
||||||
if (back[b].r.statuscode == HTTP_OK)
|
if (back[b].r.statuscode == HTTP_OK)
|
||||||
printf("%d/%d: %s%s (" LLintP " bytes) - OK\33[K\r", ptr, opt->lien_tot,
|
printf("%d/%d: %s%s (" LLintP " bytes) - OK\33[K\r", ptr, opt->lien_tot,
|
||||||
@@ -4465,8 +4551,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
|||||||
char in_error_msg[32];
|
char in_error_msg[32];
|
||||||
|
|
||||||
// resolve unresolved type
|
// resolve unresolved type
|
||||||
if (opt->savename_delayed != 0 && *forbidden_url == 0 && IS_DELAYED_EXT(afs->save)
|
if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||||
&& !opt->state.stop) {
|
*forbidden_url == 0 && IS_DELAYED_EXT(afs->save) && !opt->state.stop) {
|
||||||
int loops;
|
int loops;
|
||||||
int continue_loop;
|
int continue_loop;
|
||||||
|
|
||||||
@@ -4850,7 +4936,7 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // delayed type check ?
|
} // delayed type check ?
|
||||||
|
|
||||||
ENGINE_SAVE_CONTEXT_BASE();
|
ENGINE_SAVE_CONTEXT_BASE();
|
||||||
|
|
||||||
|
|||||||
@@ -1213,7 +1213,7 @@ HTSEXT_API find_handle hts_findfirst(char *path) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_findnext(find_handle find) {
|
HTSEXT_API hts_boolean hts_findnext(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if ((FindNextFileA(find->handle, &find->hdata)))
|
if ((FindNextFileA(find->handle, &find->hdata)))
|
||||||
@@ -1273,7 +1273,7 @@ HTSEXT_API int hts_findgetsize(find_handle find) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTSEXT_API int hts_findisdir(find_handle find) {
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
if (!hts_findissystem(find)) {
|
if (!hts_findissystem(find)) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@@ -1287,7 +1287,7 @@ HTSEXT_API int hts_findisdir(find_handle find) {
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_findisfile(find_handle find) {
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
if (!hts_findissystem(find)) {
|
if (!hts_findissystem(find)) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@@ -1301,7 +1301,7 @@ HTSEXT_API int hts_findisfile(find_handle find) {
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
HTSEXT_API int hts_findissystem(find_handle find) {
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find) {
|
||||||
if (find) {
|
if (find) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if (find->hdata.
|
if (find->hdata.
|
||||||
|
|||||||
@@ -108,15 +108,15 @@ HTSEXT_API int hts_buildtopindex(httrackp * opt, const char *path,
|
|||||||
// Portable directory find functions
|
// Portable directory find functions
|
||||||
// Directory find functions
|
// Directory find functions
|
||||||
HTSEXT_API find_handle hts_findfirst(char *path);
|
HTSEXT_API find_handle hts_findfirst(char *path);
|
||||||
HTSEXT_API int hts_findnext(find_handle find);
|
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||||
HTSEXT_API int hts_findclose(find_handle find);
|
HTSEXT_API int hts_findclose(find_handle find);
|
||||||
|
|
||||||
//
|
//
|
||||||
HTSEXT_API char *hts_findgetname(find_handle find);
|
HTSEXT_API char *hts_findgetname(find_handle find);
|
||||||
HTSEXT_API int hts_findgetsize(find_handle find);
|
HTSEXT_API int hts_findgetsize(find_handle find);
|
||||||
HTSEXT_API int hts_findisdir(find_handle find);
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||||
HTSEXT_API int hts_findisfile(find_handle find);
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||||
HTSEXT_API int hts_findissystem(find_handle find);
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -206,7 +206,8 @@ HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
|||||||
/* Logging */
|
/* Logging */
|
||||||
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
||||||
opt->log is NULL. Prefer hts_log_print(). */
|
opt->log is NULL. Prefer hts_log_print(). */
|
||||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg);
|
HTSEXT_API hts_boolean hts_log(httrackp *opt, const char *prefix,
|
||||||
|
const char *msg);
|
||||||
|
|
||||||
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
||||||
Forwards to the registered log callback, and when the level is <= opt->debug
|
Forwards to the registered log callback, and when the level is <= opt->debug
|
||||||
@@ -313,7 +314,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, char *adr);
|
|||||||
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
||||||
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
||||||
captured request line. */
|
captured request line. */
|
||||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data);
|
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||||
|
char *data);
|
||||||
|
|
||||||
/* State */
|
/* State */
|
||||||
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
||||||
@@ -334,10 +336,10 @@ HTSEXT_API int hts_is_exiting(httrackp * opt);
|
|||||||
caller-owned, NULL-terminated array of strings; the engine stores the
|
caller-owned, NULL-terminated array of strings; the engine stores the
|
||||||
pointer without copying, so the array and its strings must stay valid until
|
pointer without copying, so the array and its strings must stay valid until
|
||||||
the engine consumes them. @return nonzero if a list is now set. */
|
the engine consumes them. @return nonzero if a list is now set. */
|
||||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url);
|
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url);
|
||||||
|
|
||||||
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
||||||
HTSEXT_API int hts_resetaddurl(httrackp * opt);
|
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt);
|
||||||
|
|
||||||
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
||||||
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
||||||
@@ -356,7 +358,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int);
|
|||||||
lock, so it is safe to call from another thread). @p force is currently
|
lock, so it is safe to call from another thread). @p force is currently
|
||||||
ignored.
|
ignored.
|
||||||
@return 0; no-op if @p opt is NULL. */
|
@return 0; no-op if @p opt is NULL. */
|
||||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force);
|
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force);
|
||||||
|
|
||||||
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
||||||
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
||||||
@@ -373,7 +375,7 @@ HTSEXT_API void hts_cancel_parsing(httrackp * opt);
|
|||||||
|
|
||||||
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
||||||
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
||||||
HTSEXT_API int hts_has_stopped(httrackp * opt);
|
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt);
|
||||||
|
|
||||||
/* Tools */
|
/* Tools */
|
||||||
/** Ensure the directory chain leading to @p path exists, creating missing
|
/** Ensure the directory chain leading to @p path exists, creating missing
|
||||||
@@ -390,7 +392,7 @@ HTSEXT_API int structcheck_utf8(const char *path);
|
|||||||
/** Whether the directory containing @p path exists. The basename is stripped
|
/** Whether the directory containing @p path exists. The basename is stripped
|
||||||
first, so passing a file path tests its parent directory. @return 1 if it is
|
first, so passing a file path tests its parent directory. @return 1 if it is
|
||||||
a directory, 0 otherwise. */
|
a directory, 0 otherwise. */
|
||||||
HTSEXT_API int dir_exists(const char *path);
|
HTSEXT_API hts_boolean dir_exists(const char *path);
|
||||||
|
|
||||||
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
||||||
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
||||||
@@ -573,14 +575,15 @@ HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const cha
|
|||||||
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
||||||
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
||||||
space. Returns @p catbuff. */
|
space. Returns @p catbuff. */
|
||||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size, const char *s, const int no_high);
|
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||||
|
const char *s, const hts_boolean no_high);
|
||||||
|
|
||||||
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
||||||
@p ssize): user --assume rules, then ".html", then the built-in extension
|
@p ssize): user --assume rules, then ".html", then the built-in extension
|
||||||
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
||||||
0 otherwise. */
|
0 otherwise. */
|
||||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||||
const char *fil, int flag);
|
const char *fil, hts_boolean flag);
|
||||||
|
|
||||||
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
||||||
HTS_MIMETYPE_SIZE capacity. */
|
HTS_MIMETYPE_SIZE capacity. */
|
||||||
@@ -600,7 +603,7 @@ HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil);
|
|||||||
|
|
||||||
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
||||||
known dynamic-page type, else 0. */
|
known dynamic-page type, else 0. */
|
||||||
HTSEXT_API int is_dyntype(const char *fil);
|
HTSEXT_API hts_boolean is_dyntype(const char *fil);
|
||||||
|
|
||||||
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
||||||
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
||||||
@@ -610,12 +613,12 @@ HTSEXT_API const char *get_ext(char *catbuff, size_t size, const char *fil);
|
|||||||
|
|
||||||
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
||||||
and a built-in keep-list of commonly mislabeled types), else 0. */
|
and a built-in keep-list of commonly mislabeled types), else 0. */
|
||||||
HTSEXT_API int may_unknown(httrackp * opt, const char *st);
|
HTSEXT_API hts_boolean may_unknown(httrackp *opt, const char *st);
|
||||||
|
|
||||||
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
||||||
always producing a type. @return 1 if a type was written. */
|
always producing a type. @return 1 if a type was written. */
|
||||||
HTSEXT_API int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
HTSEXT_API hts_boolean guess_httptype_sized(httrackp *opt, char *s,
|
||||||
const char *fil);
|
size_t ssize, const char *fil);
|
||||||
|
|
||||||
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
||||||
HTS_MIMETYPE_SIZE capacity. */
|
HTS_MIMETYPE_SIZE capacity. */
|
||||||
@@ -677,7 +680,7 @@ HTSEXT_API find_handle hts_findfirst(char *path);
|
|||||||
|
|
||||||
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
||||||
at end of directory. */
|
at end of directory. */
|
||||||
HTSEXT_API int hts_findnext(find_handle find);
|
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||||
|
|
||||||
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
||||||
HTSEXT_API int hts_findclose(find_handle find);
|
HTSEXT_API int hts_findclose(find_handle find);
|
||||||
@@ -692,16 +695,16 @@ HTSEXT_API int hts_findgetsize(find_handle find);
|
|||||||
|
|
||||||
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
||||||
hts_findissystem(), reports 0). */
|
hts_findissystem(), reports 0). */
|
||||||
HTSEXT_API int hts_findisdir(find_handle find);
|
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||||
|
|
||||||
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
||||||
see hts_findissystem(), reports 0). */
|
see hts_findissystem(), reports 0). */
|
||||||
HTSEXT_API int hts_findisfile(find_handle find);
|
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||||
|
|
||||||
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
||||||
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
||||||
temporary entries. Else 0. */
|
temporary entries. Else 0. */
|
||||||
HTSEXT_API int hts_findissystem(find_handle find);
|
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||||
|
|
||||||
/* UTF-8 aware FILE API */
|
/* UTF-8 aware FILE API */
|
||||||
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
||||||
|
|||||||
@@ -288,7 +288,7 @@ static void __cdecl htsshow_uninit(t_hts_callbackarg * carg) {
|
|||||||
}
|
}
|
||||||
static int __cdecl htsshow_start(t_hts_callbackarg * carg, httrackp * opt) {
|
static int __cdecl htsshow_start(t_hts_callbackarg * carg, httrackp * opt) {
|
||||||
use_show = 0;
|
use_show = 0;
|
||||||
if (opt->verbosedisplay == 2) {
|
if (opt->verbosedisplay == HTS_VERBOSE_FULL) {
|
||||||
use_show = 1;
|
use_show = 1;
|
||||||
vt_clear();
|
vt_clear();
|
||||||
}
|
}
|
||||||
@@ -852,7 +852,7 @@ static void sig_doback(int blind) { // mettre en backing
|
|||||||
if (global_opt != NULL) {
|
if (global_opt != NULL) {
|
||||||
// suppress logging and asking lousy questions
|
// suppress logging and asking lousy questions
|
||||||
global_opt->quiet = 1;
|
global_opt->quiet = 1;
|
||||||
global_opt->verbosedisplay = 0;
|
global_opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!blind)
|
if (!blind)
|
||||||
|
|||||||
149
src/webhttrack
149
src/webhttrack
@@ -4,131 +4,140 @@
|
|||||||
# Initializes the htsserver GUI frontend and launches the default browser
|
# Initializes the htsserver GUI frontend and launches the default browser
|
||||||
|
|
||||||
BROWSEREXE=
|
BROWSEREXE=
|
||||||
SRCHBROWSEREXE="x-www-browser www-browser iceape mozilla firefox-developer-edition firefox icecat iceweasel abrowser firebird galeon konqueror midori opera google-chrome chrome chromium chromium-browser netscape firefox-developer-edition"
|
SRCHBROWSEREXE=(x-www-browser www-browser iceape mozilla firefox-developer-edition firefox icecat iceweasel abrowser firebird galeon konqueror midori opera google-chrome chrome chromium chromium-browser netscape firefox-developer-edition)
|
||||||
|
# shellcheck disable=SC2153 # BROWSER is the standard freedesktop env var, not a typo
|
||||||
if test -n "${BROWSER}"; then
|
if test -n "${BROWSER}"; then
|
||||||
# sensible-browser will f up if BROWSER is not set
|
# sensible-browser will f up if BROWSER is not set
|
||||||
SRCHBROWSEREXE="xdg-open sensible-browser ${SRCHBROWSEREXE}"
|
SRCHBROWSEREXE=(xdg-open sensible-browser "${SRCHBROWSEREXE[@]}")
|
||||||
fi
|
fi
|
||||||
# Patch for Darwin/Mac by Ross Williams
|
# Patch for Darwin/Mac by Ross Williams
|
||||||
if test "`uname -s`" == "Darwin"; then
|
if test "$(uname -s)" == "Darwin"; then
|
||||||
# Darwin/Mac OS X uses a system 'open' command to find
|
# Darwin/Mac OS X uses a system 'open' command to find
|
||||||
# the default browser. The -W flag causes it to wait for
|
# the default browser. The -W flag causes it to wait for
|
||||||
# the browser to exit
|
# the browser to exit
|
||||||
BROWSEREXE="/usr/bin/open -W"
|
BROWSEREXE="/usr/bin/open -W"
|
||||||
fi
|
fi
|
||||||
BINWD=`dirname "$0"`
|
BINWD=$(dirname "$0")
|
||||||
SRCHPATH="$BINWD /usr/local/bin /usr/share/bin /usr/bin /usr/lib/httrack /usr/local/lib/httrack /usr/local/share/httrack /opt/local/bin /sw/bin ${HOME}/usr/bin ${HOME}/bin"
|
SRCHPATH=("$BINWD" /usr/local/bin /usr/share/bin /usr/bin /usr/lib/httrack /usr/local/lib/httrack /usr/local/share/httrack /opt/local/bin /sw/bin "${HOME}/usr/bin" "${HOME}/bin")
|
||||||
SRCHPATH="$SRCHPATH "`echo $PATH | tr ":" " "`
|
IFS=':' read -ra pathdirs <<<"$PATH"
|
||||||
SRCHDISTPATH="$BINWD/../share $BINWD/.. /usr/share /usr/local /usr /local /usr/local/share ${HOME}/usr ${HOME}/usr/share /opt/local/share /sw ${HOME}/usr/local ${HOME}/usr/share"
|
for d in "${pathdirs[@]}"; do
|
||||||
|
# drop empty PATH fields, matching the old echo|tr word-split
|
||||||
|
test -n "$d" && SRCHPATH+=("$d")
|
||||||
|
done
|
||||||
|
SRCHDISTPATH=("$BINWD/../share" "$BINWD/.." /usr/share /usr/local /usr /local /usr/local/share "${HOME}/usr" "${HOME}/usr/share" /opt/local/share /sw "${HOME}/usr/local" "${HOME}/usr/share")
|
||||||
|
|
||||||
###
|
###
|
||||||
# And now some famous cuisine
|
# And now some famous cuisine
|
||||||
|
|
||||||
function log {
|
function log {
|
||||||
echo "$0($$): $@" >&2
|
echo "$0($$): $*" >&2
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
function launch_browser {
|
function launch_browser {
|
||||||
log "Launching $1"
|
log "Launching $1"
|
||||||
browser=$1
|
browser=$1
|
||||||
url=$2
|
url=$2
|
||||||
log "Spawning browser.."
|
log "Spawning browser.."
|
||||||
${browser} "${url}"
|
${browser} "${url}"
|
||||||
# note: the browser can silently use the -remote feature of
|
# note: the browser can silently use the -remote feature of
|
||||||
# mozilla and therefore return immediately
|
# mozilla and therefore return immediately
|
||||||
log "Browser (or helper) exited"
|
log "Browser (or helper) exited"
|
||||||
}
|
}
|
||||||
|
|
||||||
# First ensure that we can launch the server
|
# First ensure that we can launch the server
|
||||||
BINPATH=
|
BINPATH=
|
||||||
for i in ${SRCHPATH}; do
|
for i in "${SRCHPATH[@]}"; do
|
||||||
! test -n "${BINPATH}" && test -x ${i}/htsserver && BINPATH=${i}
|
! test -n "${BINPATH}" && test -x "${i}/htsserver" && BINPATH="${i}"
|
||||||
done
|
done
|
||||||
for i in ${SRCHDISTPATH}; do
|
for i in "${SRCHDISTPATH[@]}"; do
|
||||||
! test -n "${DISTPATH}" && test -f "${i}/httrack/lang.def" && DISTPATH="${i}/httrack"
|
! test -n "${DISTPATH}" && test -f "${i}/httrack/lang.def" && DISTPATH="${i}/httrack"
|
||||||
done
|
done
|
||||||
test -n "${BINPATH}" || ! log "Could not find htsserver" || exit 1
|
test -n "${BINPATH}" || ! log "Could not find htsserver" || exit 1
|
||||||
test -n "${DISTPATH}" || ! log "Could not find httrack directory" || exit 1
|
test -n "${DISTPATH}" || ! log "Could not find httrack directory" || exit 1
|
||||||
test -f ${DISTPATH}/lang.def || ! log "Could not find ${DISTPATH}/lang.def" || exit 1
|
test -f "${DISTPATH}/lang.def" || ! log "Could not find ${DISTPATH}/lang.def" || exit 1
|
||||||
test -f ${DISTPATH}/lang.indexes || ! log "Could not find ${DISTPATH}/lang.indexes" || exit 1
|
test -f "${DISTPATH}/lang.indexes" || ! log "Could not find ${DISTPATH}/lang.indexes" || exit 1
|
||||||
test -d ${DISTPATH}/lang || ! log "Could not find ${DISTPATH}/lang" || exit 1
|
test -d "${DISTPATH}/lang" || ! log "Could not find ${DISTPATH}/lang" || exit 1
|
||||||
test -d ${DISTPATH}/html || ! log "Could not find ${DISTPATH}/html" || exit 1
|
test -d "${DISTPATH}/html" || ! log "Could not find ${DISTPATH}/html" || exit 1
|
||||||
|
|
||||||
# Locale
|
# Locale
|
||||||
HTSLANG="${LC_MESSAGES}"
|
HTSLANG="${LC_MESSAGES}"
|
||||||
! test -n "${HTSLANG}" && HTSLANG="${LC_ALL}"
|
! test -n "${HTSLANG}" && HTSLANG="${LC_ALL}"
|
||||||
! test -n "${HTSLANG}" && HTSLANG="${LANG}"
|
! test -n "${HTSLANG}" && HTSLANG="${LANG}"
|
||||||
HTSLANG="`echo $LANG | cut -f1 -d'.' | cut -f1 -d'_'`"
|
HTSLANG="$(echo "$LANG" | cut -f1 -d'.' | cut -f1 -d'_')"
|
||||||
LANGN=`grep -E "^${HTSLANG}:" ${DISTPATH}/lang.indexes | cut -f2 -d':'`
|
LANGN=$(grep -E "^${HTSLANG}:" "${DISTPATH}/lang.indexes" | cut -f2 -d':')
|
||||||
! test -n "${LANGN}" && LANGN=1
|
! test -n "${LANGN}" && LANGN=1
|
||||||
|
|
||||||
# Find the browser
|
# Find the browser
|
||||||
# note: not all systems have sensible-browser or www-browser alternative
|
# note: not all systems have sensible-browser or www-browser alternative
|
||||||
# therefore, we have to find a bit more if sensible-browser could not be found
|
# therefore, we have to find a bit more if sensible-browser could not be found
|
||||||
|
|
||||||
for i in ${SRCHBROWSEREXE}; do
|
for i in "${SRCHBROWSEREXE[@]}"; do
|
||||||
for j in ${SRCHPATH}; do
|
for j in "${SRCHPATH[@]}"; do
|
||||||
if test -x ${j}/${i}; then
|
if test -x "${j}/${i}"; then
|
||||||
BROWSEREXE=${j}/${i}
|
BROWSEREXE="${j}/${i}"
|
||||||
fi
|
fi
|
||||||
test -n "$BROWSEREXE" && break
|
test -n "$BROWSEREXE" && break
|
||||||
done
|
done
|
||||||
test -n "$BROWSEREXE" && break
|
test -n "$BROWSEREXE" && break
|
||||||
done
|
done
|
||||||
test -n "$BROWSEREXE" || ! log "Could not find any suitable browser" || exit 1
|
test -n "$BROWSEREXE" || ! log "Could not find any suitable browser" || exit 1
|
||||||
|
|
||||||
# "browse" command
|
# "browse" command
|
||||||
if test "$1" = "browse"; then
|
if test "$1" = "browse"; then
|
||||||
if test -f "${HOME}/.httrack.ini"; then
|
if test -f "${HOME}/.httrack.ini"; then
|
||||||
INDEXF=`cat ${HOME}/.httrack.ini | tr '\r' '\n' | grep -E "^path=" | cut -f2- -d'='`
|
INDEXF=$(tr '\r' '\n' <"${HOME}/.httrack.ini" | grep -E "^path=" | cut -f2- -d'=')
|
||||||
if test -n "${INDEXF}" -a -d "${INDEXF}" -a -f "${INDEXF}/index.html"; then
|
if test -n "${INDEXF}" -a -d "${INDEXF}" -a -f "${INDEXF}/index.html"; then
|
||||||
INDEXF="${INDEXF}/index.html"
|
INDEXF="${INDEXF}/index.html"
|
||||||
else
|
else
|
||||||
INDEXF=""
|
INDEXF=""
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
if ! test -n "$INDEXF"; then
|
if ! test -n "$INDEXF"; then
|
||||||
INDEXF="${HOME}/websites/index.html"
|
INDEXF="${HOME}/websites/index.html"
|
||||||
fi
|
fi
|
||||||
launch_browser "${BROWSEREXE}" "file://${INDEXF}"
|
launch_browser "${BROWSEREXE}" "file://${INDEXF}"
|
||||||
exit $?
|
exit $?
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Create a temporary filename
|
# Create a temporary filename
|
||||||
TMPSRVFILE="$(mktemp ${TMPDIR:-/tmp}/.webhttrack.XXXXXXXX)" || ! log "Could not create the temporary file ${TMPSRVFILE}" || exit 1
|
TMPSRVFILE="$(mktemp "${TMPDIR:-/tmp}/.webhttrack.XXXXXXXX")" || ! log "Could not create the temporary file ${TMPSRVFILE}" || exit 1
|
||||||
# Launch htsserver binary and setup the server
|
# Launch htsserver binary and setup the server
|
||||||
(${BINPATH}/htsserver "${DISTPATH}/" --ppid "$$" path "${HOME}/websites" lang "${LANGN}" $@; echo SRVURL=error) > ${TMPSRVFILE}&
|
(
|
||||||
|
"${BINPATH}/htsserver" "${DISTPATH}/" --ppid "$$" path "${HOME}/websites" lang "${LANGN}" "$@"
|
||||||
|
echo SRVURL=error
|
||||||
|
) >"${TMPSRVFILE}" &
|
||||||
# Find the generated SRVURL
|
# Find the generated SRVURL
|
||||||
SRVURL=
|
SRVURL=
|
||||||
MAXCOUNT=60
|
MAXCOUNT=60
|
||||||
while ! test -n "$SRVURL"; do
|
while ! test -n "$SRVURL"; do
|
||||||
MAXCOUNT=$[$MAXCOUNT - 1]
|
MAXCOUNT=$((MAXCOUNT - 1))
|
||||||
test $MAXCOUNT -gt 0 || exit 1
|
test $MAXCOUNT -gt 0 || exit 1
|
||||||
test $MAXCOUNT -lt 50 && echo "waiting for server to reply.."
|
test $MAXCOUNT -lt 50 && echo "waiting for server to reply.."
|
||||||
SRVURL=`grep -E URL= ${TMPSRVFILE} | cut -f2- -d=`
|
SRVURL=$(grep -E URL= "${TMPSRVFILE}" | cut -f2- -d=)
|
||||||
test ! "$SRVURL" = "error" || ! log "Could not spawn htsserver" || exit 1
|
test ! "$SRVURL" = "error" || ! log "Could not spawn htsserver" || exit 1
|
||||||
test -n "$SRVURL" || sleep 1
|
test -n "$SRVURL" || sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
# Cleanup function
|
# Cleanup function
|
||||||
|
# shellcheck disable=SC2120 # $1 is an optional "signal caught" marker; bare calls are intentional
|
||||||
function cleanup {
|
function cleanup {
|
||||||
test -n "$1" && log "Nasty signal caught, cleaning up.."
|
test -n "$1" && log "Nasty signal caught, cleaning up.."
|
||||||
# Do not kill if browser exited (chrome bug issue) ; server will die itself
|
# Do not kill if browser exited (chrome bug issue) ; server will die itself
|
||||||
test -n "$1" && test -f ${TMPSRVFILE} && SRVPID=`grep -E PID= ${TMPSRVFILE} | cut -f2- -d=`
|
test -n "$1" && test -f "${TMPSRVFILE}" && SRVPID=$(grep -E PID= "${TMPSRVFILE}" | cut -f2- -d=)
|
||||||
test -n "${SRVPID}" && kill -9 ${SRVPID}
|
test -n "${SRVPID}" && kill -9 "${SRVPID}"
|
||||||
test -f ${TMPSRVFILE} && rm ${TMPSRVFILE}
|
test -f "${TMPSRVFILE}" && rm "${TMPSRVFILE}"
|
||||||
test -n "$1" && log "..Done"
|
test -n "$1" && log "..Done"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
# Cleanup in case of emergency
|
# Cleanup in case of emergency
|
||||||
trap "cleanup now; exit" 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap "cleanup now; exit" HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
|
|
||||||
# Got SRVURL, launch browser
|
# Got SRVURL, launch browser
|
||||||
launch_browser "${BROWSEREXE}" "${SRVURL}"
|
launch_browser "${BROWSEREXE}" "${SRVURL}"
|
||||||
|
|
||||||
# That's all, folks!
|
# That's all, folks!
|
||||||
trap "" 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap "" HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
cleanup
|
cleanup
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
@@ -6,11 +6,11 @@ set -euo pipefail
|
|||||||
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
# charset -> UTF-8 conversion (hts_convertStringToUTF8).
|
||||||
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
# -#3 <charset> <string> prints the string re-decoded from <charset> as UTF-8.
|
||||||
conv() {
|
conv() {
|
||||||
test "$(httrack -O /dev/null -#3 "$1" "$2")" == "$3" || exit 1
|
test "$(httrack -O /dev/null -#3 "$1" "$2")" == "$3" || exit 1
|
||||||
}
|
}
|
||||||
# crash probe: malformed input must exit cleanly, not abort.
|
# crash probe: malformed input must exit cleanly, not abort.
|
||||||
runs() {
|
runs() {
|
||||||
httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || exit 1
|
httrack -O /dev/null -#3 "$1" "$2" >/dev/null 2>&1 || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
# the source bytes below are UTF-8 (this file is UTF-8); "café" is 0x63 61 66 C3 A9.
|
||||||
|
|||||||
15
tests/01_engine-cookies.test
Executable file
15
tests/01_engine-cookies.test
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||||
|
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
||||||
|
out=$(httrack -#Q run)
|
||||||
|
|
||||||
|
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||||
|
test "$out" = "cookie-header: OK" || {
|
||||||
|
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
17
tests/01_engine-copyopt.test
Executable file
17
tests/01_engine-copyopt.test
Executable file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||||
|
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||||
|
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
||||||
|
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
||||||
|
out=$(httrack -#9 run)
|
||||||
|
|
||||||
|
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||||
|
test "$out" = "copy-htsopt: OK" || {
|
||||||
|
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
@@ -89,4 +89,37 @@ grep -q NEWCONTENT "$(find "$out" -path '*/a.html' -print -quit)" || {
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- 3. an empty quoted arg survives the doit.log round-trip (#106) ----------
|
||||||
|
# -%F "" (empty footer) records an empty "" token in doit.log; -r2 follows it so
|
||||||
|
# a "drop the empty token" bug shifts -r2 into -%F's slot (the reprise then sees
|
||||||
|
# -%F -r2 and panics "%F needs to be followed by ..."), making the bug visible
|
||||||
|
# rather than a harmless run off the end of argv.
|
||||||
|
out2="$tmp/out2"
|
||||||
|
rc=0
|
||||||
|
"$bin" "$url" -O "$out2" --quiet -n -%v0 -%F "" -r2 >/dev/null 2>&1 || rc=$?
|
||||||
|
test "$rc" -eq 0 || {
|
||||||
|
echo "FAIL: initial mirror with empty footer exited $rc"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
# precondition: the writer put the empty token on disk for the reader to reload.
|
||||||
|
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||||
|
echo "FAIL: empty footer not recorded as -%F \"\" -r2 in doit.log"
|
||||||
|
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
# no-url reprise: the reader rebuilds argv from doit.log and rewrites doit.log
|
||||||
|
# from it. The empty token surviving in the regenerated file proves the reader
|
||||||
|
# kept it (a drop/swallow would panic above or rewrite -%F without the "").
|
||||||
|
rc=0
|
||||||
|
"$bin" -O "$out2" --quiet >/dev/null 2>&1 || rc=$?
|
||||||
|
test "$rc" -eq 0 || {
|
||||||
|
echo "FAIL: empty-footer reprise exited $rc (empty token dropped from doit.log?)"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||||
|
echo "FAIL: empty footer did not survive the doit.log reload round-trip"
|
||||||
|
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
@@ -6,11 +6,11 @@ set -euo pipefail
|
|||||||
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
# HTML entity unescaping (hts_unescapeEntitiesWithCharset).
|
||||||
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
# -#6 <string> prints the string with entities decoded (UTF-8 output).
|
||||||
ent() {
|
ent() {
|
||||||
test "$(httrack -O /dev/null -#6 "$1")" == "$2" || exit 1
|
test "$(httrack -O /dev/null -#6 "$1")" == "$2" || exit 1
|
||||||
}
|
}
|
||||||
# crash probe: malformed input must exit cleanly, not abort.
|
# crash probe: malformed input must exit cleanly, not abort.
|
||||||
runs() {
|
runs() {
|
||||||
httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || exit 1
|
httrack -O /dev/null -#6 "$1" >/dev/null 2>&1 || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# named entities
|
# named entities
|
||||||
|
|||||||
@@ -7,10 +7,10 @@ set -euo pipefail
|
|||||||
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
# -#0 <filter> <string> prints "<string> does match <filter>" or "... does NOT match ...".
|
||||||
|
|
||||||
match() {
|
match() {
|
||||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does match $1" || exit 1
|
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does match $1" || exit 1
|
||||||
}
|
}
|
||||||
nomatch() {
|
nomatch() {
|
||||||
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
test "$(httrack -O /dev/null -#0 "$1" "$2")" == "$2 does NOT match $1" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# bare star matches everything
|
# bare star matches everything
|
||||||
@@ -67,7 +67,7 @@ nomatch '*[\[]' 'a'
|
|||||||
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
|
# filter guide claims (GitHub #148); it parses as the class {'[','\'} followed
|
||||||
# by a trailing literal ']'. These assertions document the current (buggy)
|
# by a trailing literal ']'. These assertions document the current (buggy)
|
||||||
# behavior so any future matcher fix is a deliberate, visible change.
|
# behavior so any future matcher fix is a deliberate, visible change.
|
||||||
nomatch '*[\[\]]' '[' # not matched, despite the docs
|
nomatch '*[\[\]]' '[' # not matched, despite the docs
|
||||||
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
match '*[\[\]]' ']' # only via the empty class-match + trailing ']'
|
||||||
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
match '*[\[\]]' '[]' # one of {'[','\'} then the trailing ']'
|
||||||
nomatch '*[\[\]]' '[]x'
|
nomatch '*[\[\]]' '[]x'
|
||||||
|
|||||||
@@ -7,10 +7,10 @@ set -euo pipefail
|
|||||||
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
# -#2 <path> prints "<path> is '<mime>'" then "and its local type is '.<ext>'".
|
||||||
|
|
||||||
mime() {
|
mime() {
|
||||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is '$2'" || exit 1
|
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is '$2'" || exit 1
|
||||||
}
|
}
|
||||||
unknown() {
|
unknown() {
|
||||||
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
test "$(httrack -O /dev/null -#2 "$1" | head -1)" == "$1 is of an unknown MIME type" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
mime '/a/b.html' 'text/html'
|
mime '/a/b.html' 'text/html'
|
||||||
|
|||||||
@@ -154,4 +154,173 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
|
|||||||
grep -q 'title="file://' "$saved2" ||
|
grep -q 'title="file://' "$saved2" ||
|
||||||
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
||||||
|
|
||||||
|
# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
|
||||||
|
# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
|
||||||
|
site3="$tmp/xmlns"
|
||||||
|
mkdir -p "$site3"
|
||||||
|
for f in ns og rdfs real; do gif "$site3/$f.gif"; done
|
||||||
|
cat >"$site3/index.html" <<EOF
|
||||||
|
<html xmlns="file://$site3/ns.gif"><body>
|
||||||
|
<svg xmlns:og="file://$site3/og.gif"></svg>
|
||||||
|
<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
|
||||||
|
<a href="file://$site3/real.gif">real link</a>
|
||||||
|
</body></html>
|
||||||
|
EOF
|
||||||
|
out3="$tmp/xmlns-out"
|
||||||
|
crawl "$site3/index.html" "$out3"
|
||||||
|
|
||||||
|
# the real link is still captured
|
||||||
|
found "real.gif" "$out3"
|
||||||
|
# namespace-declaration targets must not be fetched (default + prefixed forms)
|
||||||
|
notfound "ns.gif" "$out3"
|
||||||
|
notfound "og.gif" "$out3"
|
||||||
|
notfound "rdfs.gif" "$out3"
|
||||||
|
|
||||||
|
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||||
|
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||||
|
# a space before ';'); they are the negative controls: without the parser fix the
|
||||||
|
# URL is dropped, so a regression fails these found() checks.
|
||||||
|
site4="$tmp/cssimport"
|
||||||
|
mkdir -p "$site4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||||
|
cat >"$site4/main.css" <<'EOF'
|
||||||
|
@import url(nq.css);
|
||||||
|
@import url("dqu.css");
|
||||||
|
@import url('squ.css');
|
||||||
|
@import "dqs.css";
|
||||||
|
@import 'sqs.css';
|
||||||
|
@import url(med.css) screen and (min-width: 400px);
|
||||||
|
@import "cond.css" screen;
|
||||||
|
@import "sup.css" supports(display: flex);
|
||||||
|
@import url(lay.css) layer(base);
|
||||||
|
@import "spc.css" ;
|
||||||
|
EOF
|
||||||
|
out4="$tmp/cssimport-out"
|
||||||
|
crawl "$site4/main.css" "$out4"
|
||||||
|
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||||
|
|
||||||
|
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||||
|
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||||
|
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||||
|
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||||
|
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||||
|
grep -Fq "$cond" "$m4" ||
|
||||||
|
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||||
|
done
|
||||||
|
|
||||||
|
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||||
|
# capture a bogus link; a valid sibling import is still captured. Guards a heap
|
||||||
|
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||||
|
site5="$tmp/cssimport-trunc"
|
||||||
|
mkdir -p "$site5"
|
||||||
|
printf 'body{}\n' >"$site5/good.css"
|
||||||
|
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||||
|
out5="$tmp/cssimport-trunc-out"
|
||||||
|
crawl "$site5/main.css" "$out5"
|
||||||
|
found "good.css" "$out5"
|
||||||
|
notfound "trunc" "$out5"
|
||||||
|
|
||||||
|
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||||
|
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
|
||||||
|
# url() target is still captured; here it just must not underflow.
|
||||||
|
site6="$tmp/parse-off0"
|
||||||
|
mkdir -p "$site6"
|
||||||
|
printf 'body{}\n' >"$site6/off0.css"
|
||||||
|
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||||
|
out6="$tmp/parse-off0-out"
|
||||||
|
crawl "$site6/main.css" "$out6"
|
||||||
|
found "off0.css" "$out6"
|
||||||
|
|
||||||
|
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
|
||||||
|
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
|
||||||
|
# fixture saves a bare file named GET; a live server mangles it to GET.html).
|
||||||
|
# window.open(url) detection must be unaffected.
|
||||||
|
site7="$tmp/xhropen"
|
||||||
|
mkdir -p "$site7"
|
||||||
|
gif "$site7/winopen.gif"
|
||||||
|
cat >"$site7/index.html" <<EOF
|
||||||
|
<html><body><script>
|
||||||
|
var x = new XMLHttpRequest();
|
||||||
|
x.open("GET", "ajax_info.txt");
|
||||||
|
var y = new XMLHttpRequest();
|
||||||
|
y.open("Post", "submit.cgi");
|
||||||
|
window.open("file://$site7/winopen.gif");
|
||||||
|
</script></body></html>
|
||||||
|
EOF
|
||||||
|
out7="$tmp/xhropen-out"
|
||||||
|
crawl "$site7/index.html" "$out7"
|
||||||
|
# negative control: without the fix a file named exactly GET is downloaded
|
||||||
|
notfound "GET" "$out7"
|
||||||
|
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
|
||||||
|
# method is rejected too, so a file named Post must not appear either
|
||||||
|
notfound "Post" "$out7"
|
||||||
|
# regression guard: window.open(url) is still detected, so its absolute URL is
|
||||||
|
# rewritten to a local link. The rewrite only happens if the parser saw it, so
|
||||||
|
# these two assertions fail if .open detection broke (not a trivial --near save).
|
||||||
|
saved7=$(savedhtml "$out7")
|
||||||
|
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
|
||||||
|
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
|
||||||
|
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
|
||||||
|
! grep -Fq 'window.open("file://' "$saved7" ||
|
||||||
|
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
|
||||||
|
|
||||||
|
# Parens in an unquoted url(...) (#163): the source %28/%29 decode to literal
|
||||||
|
# '(' ')' in the saved name, but a literal ')' in the rewritten url() closes the
|
||||||
|
# token early, so they must stay encoded. Negative control: without the fix the
|
||||||
|
# %281%29 greps fail (parens are RFC2396 "mark" chars the escaper leaves alone).
|
||||||
|
site8="$tmp/cssparens"
|
||||||
|
mkdir -p "$site8"
|
||||||
|
for f in 'img (1).gif' 'a(b)c(1).gif' 'q (4).gif'; do gif "$site8/$f"; done
|
||||||
|
cat >"$site8/style.css" <<'EOF'
|
||||||
|
.a { background: url(img%20%281%29.gif); }
|
||||||
|
.b { background: url(a%28b%29c%281%29.gif); }
|
||||||
|
.c { background: url("q%20%284%29.gif"); }
|
||||||
|
EOF
|
||||||
|
out8="$tmp/cssparens-out"
|
||||||
|
crawl "$site8/style.css" "$out8"
|
||||||
|
found "img (1).gif" "$out8"
|
||||||
|
found "a(b)c(1).gif" "$out8"
|
||||||
|
found "q (4).gif" "$out8"
|
||||||
|
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
|
||||||
|
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
|
||||||
|
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
|
||||||
|
! echo "FAIL #163: parens in unquoted url() not percent-encoded on rewrite" || exit 1
|
||||||
|
grep -Fq 'url(a%28b%29c%281%29.gif)' "$css8" ||
|
||||||
|
! echo "FAIL #163: not every paren in a url() was percent-encoded" || exit 1
|
||||||
|
grep -Fq 'url("q%20%284%29.gif")' "$css8" ||
|
||||||
|
! echo "FAIL #163: quoted url() altered or parens left literal on rewrite" || exit 1
|
||||||
|
|
||||||
|
# The url() detector is not CSS-specific: <script> and inline style= get the
|
||||||
|
# same encoding, but ordinary href/src (ending_p is the quote, not ')') keep
|
||||||
|
# literal parens -- the attribute checks guard the gate against over-firing.
|
||||||
|
site9="$tmp/urlparens"
|
||||||
|
mkdir -p "$site9"
|
||||||
|
for f in 'js (1).gif' 'inl (2).gif' 'asrc (3).gif' 'ahref (4).gif'; do gif "$site9/$f"; done
|
||||||
|
cat >"$site9/index.html" <<EOF
|
||||||
|
<html><body>
|
||||||
|
<script>var bg = "url(js%20%281%29.gif)";</script>
|
||||||
|
<div style="background-image:url(inl%20%282%29.gif)"></div>
|
||||||
|
<img src="asrc%20%283%29.gif">
|
||||||
|
<a href="ahref%20%284%29.gif">link</a>
|
||||||
|
</body></html>
|
||||||
|
EOF
|
||||||
|
out9="$tmp/urlparens-out"
|
||||||
|
crawl "$site9/index.html" "$out9"
|
||||||
|
saved9=$(savedhtml "$out9")
|
||||||
|
test -n "$saved9" || ! echo "FAIL: saved urlparens page not found" || exit 1
|
||||||
|
# rewrite-only: the JS-string asset is not queued for download
|
||||||
|
grep -Fq 'url(js%20%281%29.gif)' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in <script> url() not percent-encoded" || exit 1
|
||||||
|
found "inl (2).gif" "$out9"
|
||||||
|
grep -Fq 'url(inl%20%282%29.gif)' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in inline style url() not percent-encoded" || exit 1
|
||||||
|
found "asrc (3).gif" "$out9"
|
||||||
|
found "ahref (4).gif" "$out9"
|
||||||
|
grep -Fq 'src="asrc%20(3).gif"' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in a plain src attribute were wrongly encoded" || exit 1
|
||||||
|
grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||||
|
! echo "FAIL #163: parens in a plain href attribute were wrongly encoded" || exit 1
|
||||||
|
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||||
|
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
68
tests/01_engine-relative.test
Executable file
68
tests/01_engine-relative.test
Executable file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# lienrelatif (build relative path) + ident_url_relatif (resolve a link, collapse
|
||||||
|
# ./ and ../). Regression net for #137/#162; expected values hand-computed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# relative path from <curr>'s directory to <link>
|
||||||
|
rel() {
|
||||||
|
local got
|
||||||
|
got=$(httrack -O /dev/null -#l "$1" "$2")
|
||||||
|
test "$got" == "relative=$3" ||
|
||||||
|
{
|
||||||
|
echo "FAIL rel($1, $2): got '$got' want 'relative=$3'"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# resolve <link> against origin <adr>/<fil> -> adr=.. fil=..
|
||||||
|
ident() {
|
||||||
|
local got
|
||||||
|
got=$(httrack -O /dev/null -#i "$1" "$2" "$3")
|
||||||
|
test "$got" == "$4" ||
|
||||||
|
{
|
||||||
|
echo "FAIL ident($1, $2, $3): got '$got' want '$4'"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
### lienrelatif
|
||||||
|
|
||||||
|
rel 'dir/page.html' 'dir/index.html' 'page.html'
|
||||||
|
rel 'dir/page.html' 'dir/page.html' 'page.html' # self-link
|
||||||
|
rel 'a.html' 'dir/index.html' '../a.html'
|
||||||
|
rel 'x.html' 'a/b/c/index.html' '../../../x.html'
|
||||||
|
rel 'h/a/x.jpg' 'h/a/sub/page.html' '../x.jpg'
|
||||||
|
rel 'a/b/c/x.html' 'index.html' 'a/b/c/x.html'
|
||||||
|
rel 'h/sub/x.jpg' 'h/page.html' 'sub/x.jpg'
|
||||||
|
rel 'h/dir2/x.jpg' 'h/dir1/page.html' '../dir2/x.jpg' # sibling dir
|
||||||
|
rel 'h/bc/x.jpg' 'h/b/page.html' '../bc/x.jpg' # b/bc prefix trap
|
||||||
|
rel 'h/b/x.jpg' 'h/bc/page.html' '../b/x.jpg'
|
||||||
|
rel 'h2/img/x.jpg' 'h1/p/page.html' '../../h2/img/x.jpg' # cross-host
|
||||||
|
rel 'img.cdn/photo.jpg' 'www.site/articles/2020/post.html' '../../../img.cdn/photo.jpg'
|
||||||
|
rel 'h/a/' 'h/a/sub/page.html' '../' # link is ancestor dir
|
||||||
|
rel 'x.html' 'page.html' 'x.html'
|
||||||
|
rel 'dir/page.html?x=1' 'dir/index.html?y=2' 'page.html' # ? stripped
|
||||||
|
|
||||||
|
### ident_url_relatif
|
||||||
|
|
||||||
|
ident 'img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/dir/img.gif'
|
||||||
|
ident 'sub/img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/dir/sub/img.gif'
|
||||||
|
ident '/img.gif' 'www.foo.com' '/dir/page.html' 'adr=www.foo.com fil=/img.gif'
|
||||||
|
# embedded ../ collapses (#137)
|
||||||
|
ident '../img.gif' 'www.foo.com' '/dir/sub/page.html' 'adr=www.foo.com fil=/dir/img.gif'
|
||||||
|
ident 'sub/../logo.png' 'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/articles/2020/logo.png'
|
||||||
|
ident '../../pix/sub/../logo.png' 'www.foo.com' '/articles/2020/post.html' 'adr=www.foo.com fil=/pix/logo.png'
|
||||||
|
ident '../../../../x.gif' 'www.foo.com' '/a/b/page.html' 'adr=www.foo.com fil=/x.gif' # above-root clamp
|
||||||
|
ident '?page=2' 'www.foo.com' '/dir/index.html?old=1' 'adr=www.foo.com fil=/dir/index.html?page=2'
|
||||||
|
ident 'http://other.com/a/b/../c/index.html' 'www.foo.com' '/p.html' 'adr=other.com fil=/a/c/index.html'
|
||||||
|
# file:// collapses ../ like the other schemes; traversal contained, // authority kept
|
||||||
|
ident 'file:///var/data/pix/sub/../logo.png' 'www.foo.com' '/p.html' 'adr=file:// fil=/var/data/pix/logo.png'
|
||||||
|
ident 'file:///a/b/c/../../d/e.gif' 'www.foo.com' '/p.html' 'adr=file:// fil=/a/d/e.gif'
|
||||||
|
ident 'file:///a/../../b' 'www.foo.com' '/p.html' 'adr=file:// fil=/b'
|
||||||
|
ident 'file://srv/share/../x' 'www.foo.com' '/p.html' 'adr=file:// fil=//srv/x'
|
||||||
|
ident 'mailto:foo@bar.com' 'www.foo.com' '/p.html' 'error=-1' # unsupported scheme
|
||||||
|
ident 'javascript:void(0)' 'www.foo.com' '/p.html' 'error=-1'
|
||||||
|
|
||||||
|
echo "OK"
|
||||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
|||||||
|
|
||||||
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
# path simplify engine (fil_simplifie): collapses ./ and ../ segments.
|
||||||
simp() {
|
simp() {
|
||||||
test "$(httrack -O /dev/null -#1 "$1")" == "simplified=$2" || exit 1
|
test "$(httrack -O /dev/null -#1 "$1")" == "simplified=$2" || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
simp './foo/bar/' 'foo/bar/'
|
simp './foo/bar/' 'foo/bar/'
|
||||||
@@ -26,3 +26,17 @@ simp './a/../../b' 'b'
|
|||||||
|
|
||||||
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
|
# empty segments ('//') are not dot-segments and are preserved, per RFC 3986
|
||||||
simp 'a//b' 'a//b'
|
simp 'a//b' 'a//b'
|
||||||
|
simp 'a//b/../c' 'a//c'
|
||||||
|
|
||||||
|
# absolute paths keep the leading '/'; above-root '..' is clamped to it
|
||||||
|
simp '/a/../b' '/b'
|
||||||
|
simp '/a/../../b' '/b'
|
||||||
|
simp '/../x' '/x'
|
||||||
|
|
||||||
|
# collapses to nothing -> './' (relative) or '/' (absolute)
|
||||||
|
simp '..' './'
|
||||||
|
simp 'a/..' './'
|
||||||
|
simp '/' '/'
|
||||||
|
|
||||||
|
simp 'a/b/..' 'a/' # trailing bare '..'
|
||||||
|
simp 'a/../b?x=../y' 'b?x=../y' # '?' freezes simplification
|
||||||
|
|||||||
@@ -21,9 +21,15 @@ test "$out" == "strsafe: OK" || exit 1
|
|||||||
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
# the bounded macro aborts (non-zero exit), so don't let set -e trip on it
|
||||||
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
err=$(httrack -#8 overflow "this string is far too long for the buffer" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*) echo "over-capacity write was NOT caught" >&2; exit 1 ;;
|
*"strsafe: NOT aborted"*)
|
||||||
*"overflow while copying"*) ;;
|
echo "over-capacity write was NOT caught" >&2
|
||||||
*) echo "expected htssafe overflow abort, got: $err" >&2; exit 1 ;;
|
exit 1
|
||||||
|
;;
|
||||||
|
*"overflow while copying"*) ;;
|
||||||
|
*)
|
||||||
|
echo "expected htssafe overflow abort, got: $err" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Same guarantee for the htsbuff builder. The source is exactly the buffer
|
# Same guarantee for the htsbuff builder. The source is exactly the buffer
|
||||||
@@ -32,7 +38,13 @@ esac
|
|||||||
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
# aborted"). Match the specific htsbuff abort message, not just any assert.
|
||||||
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
err=$(httrack -#8 overflow-buff "abcd" 2>&1) || true
|
||||||
case "$err" in
|
case "$err" in
|
||||||
*"strsafe: NOT aborted"*) echo "htsbuff over-capacity write was NOT caught" >&2; exit 1 ;;
|
*"strsafe: NOT aborted"*)
|
||||||
*"htsbuff append overflow"*) ;;
|
echo "htsbuff over-capacity write was NOT caught" >&2
|
||||||
*) echo "expected htsbuff overflow abort, got: $err" >&2; exit 1 ;;
|
exit 1
|
||||||
|
;;
|
||||||
|
*"htsbuff append overflow"*) ;;
|
||||||
|
*)
|
||||||
|
echo "expected htsbuff overflow abort, got: $err" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
bash crawl-test.sh --errors 0 --files 5 httrack http://ut.httrack.com/simple/basic.html
|
bash crawl-test.sh --errors 0 --files 5 httrack http://ut.httrack.com/simple/basic.html
|
||||||
|
|||||||
@@ -3,10 +3,10 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
bash crawl-test.sh --errors 0 --files 3 \
|
bash crawl-test.sh --errors 0 --files 3 \
|
||||||
--found ut.httrack.com/cookies/third.html \
|
--found ut.httrack.com/cookies/third.html \
|
||||||
--found ut.httrack.com/cookies/second.html \
|
--found ut.httrack.com/cookies/second.html \
|
||||||
--found ut.httrack.com/cookies/entrance.html \
|
--found ut.httrack.com/cookies/entrance.html \
|
||||||
httrack http://ut.httrack.com/cookies/entrance.php
|
httrack http://ut.httrack.com/cookies/entrance.php
|
||||||
|
|||||||
@@ -3,21 +3,21 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
# unicode tests
|
# unicode tests
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 1 --files 5 \
|
--errors 1 --files 5 \
|
||||||
--found 'café.ut.httrack.com/unicode-links/café3860.html' \
|
--found 'café.ut.httrack.com/unicode-links/café3860.html' \
|
||||||
--found 'café.ut.httrack.com/unicode-links/café30f4.html' \
|
--found 'café.ut.httrack.com/unicode-links/café30f4.html' \
|
||||||
--found 'café.ut.httrack.com/unicode-links/café5e1f.html' \
|
--found 'café.ut.httrack.com/unicode-links/café5e1f.html' \
|
||||||
--found 'café.ut.httrack.com/unicode-links/café7b30.html' \
|
--found 'café.ut.httrack.com/unicode-links/café7b30.html' \
|
||||||
httrack 'http://ut.httrack.com/unicode-links/idna.html' \
|
httrack 'http://ut.httrack.com/unicode-links/idna.html' \
|
||||||
'+*.ut.httrack.com/*' --robots=0
|
'+*.ut.httrack.com/*' --robots=0
|
||||||
|
|
||||||
# unicode tests (bogus links)
|
# unicode tests (bogus links)
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 0 --files 1 \
|
--errors 0 --files 1 \
|
||||||
--found 'ut.httrack.com/unicode-links/idna_bogus.html' \
|
--found 'ut.httrack.com/unicode-links/idna_bogus.html' \
|
||||||
httrack 'http://ut.httrack.com/unicode-links/idna_bogus.html' \
|
httrack 'http://ut.httrack.com/unicode-links/idna_bogus.html' \
|
||||||
'-*' --robots=0
|
'-*' --robots=0
|
||||||
|
|||||||
@@ -3,67 +3,67 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
# unicode tests
|
# unicode tests
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 1 --files 10 \
|
--errors 1 --files 10 \
|
||||||
--found ut.httrack.com/unicode-links/caf%a91bce.html \
|
--found ut.httrack.com/unicode-links/caf%a91bce.html \
|
||||||
--found ut.httrack.com/unicode-links/café30f4.html \
|
--found ut.httrack.com/unicode-links/café30f4.html \
|
||||||
--found ut.httrack.com/unicode-links/café3860.html \
|
--found ut.httrack.com/unicode-links/café3860.html \
|
||||||
--found ut.httrack.com/unicode-links/café463e.html \
|
--found ut.httrack.com/unicode-links/café463e.html \
|
||||||
--found ut.httrack.com/unicode-links/café5e1f.html \
|
--found ut.httrack.com/unicode-links/café5e1f.html \
|
||||||
--found ut.httrack.com/unicode-links/café7b30.html \
|
--found ut.httrack.com/unicode-links/café7b30.html \
|
||||||
--found ut.httrack.com/unicode-links/café8007.html \
|
--found ut.httrack.com/unicode-links/café8007.html \
|
||||||
--found ut.httrack.com/unicode-links/café9fa8.html \
|
--found ut.httrack.com/unicode-links/café9fa8.html \
|
||||||
--found ut.httrack.com/unicode-links/caféae52.html \
|
--found ut.httrack.com/unicode-links/caféae52.html \
|
||||||
--found ut.httrack.com/unicode-links/caféc009.html \
|
--found ut.httrack.com/unicode-links/caféc009.html \
|
||||||
--found ut.httrack.com/unicode-links/utf8.html \
|
--found ut.httrack.com/unicode-links/utf8.html \
|
||||||
httrack http://ut.httrack.com/unicode-links/utf8.html
|
httrack http://ut.httrack.com/unicode-links/utf8.html
|
||||||
|
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 4 --files 7 \
|
--errors 4 --files 7 \
|
||||||
--found ut.httrack.com/unicode-links/café3860.html \
|
--found ut.httrack.com/unicode-links/café3860.html \
|
||||||
--found ut.httrack.com/unicode-links/café9fa8.html \
|
--found ut.httrack.com/unicode-links/café9fa8.html \
|
||||||
--found ut.httrack.com/unicode-links/café30f4.html \
|
--found ut.httrack.com/unicode-links/café30f4.html \
|
||||||
--found ut.httrack.com/unicode-links/café5e1f.html \
|
--found ut.httrack.com/unicode-links/café5e1f.html \
|
||||||
--found ut.httrack.com/unicode-links/café7b30.html \
|
--found ut.httrack.com/unicode-links/café7b30.html \
|
||||||
--found ut.httrack.com/unicode-links/café8007.html \
|
--found ut.httrack.com/unicode-links/café8007.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%e939bd.html \
|
--found ut.httrack.com/unicode-links/caf%e939bd.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%e9ae52.html \
|
--found ut.httrack.com/unicode-links/caf%e9ae52.html \
|
||||||
--found ut.httrack.com/unicode-links/caféaec2.html \
|
--found ut.httrack.com/unicode-links/caféaec2.html \
|
||||||
--found ut.httrack.com/unicode-links/caféfad6.html \
|
--found ut.httrack.com/unicode-links/caféfad6.html \
|
||||||
--found ut.httrack.com/unicode-links/default.html \
|
--found ut.httrack.com/unicode-links/default.html \
|
||||||
httrack http://ut.httrack.com/unicode-links/default.html
|
httrack http://ut.httrack.com/unicode-links/default.html
|
||||||
|
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 2 --files 9 \
|
--errors 2 --files 9 \
|
||||||
--found ut.httrack.com/unicode-links/caf%a9ae52.html \
|
--found ut.httrack.com/unicode-links/caf%a9ae52.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%a9bf59.html \
|
--found ut.httrack.com/unicode-links/caf%a9bf59.html \
|
||||||
--found ut.httrack.com/unicode-links/café30f4.html \
|
--found ut.httrack.com/unicode-links/café30f4.html \
|
||||||
--found ut.httrack.com/unicode-links/café3860.html \
|
--found ut.httrack.com/unicode-links/café3860.html \
|
||||||
--found ut.httrack.com/unicode-links/café5e1f.html \
|
--found ut.httrack.com/unicode-links/café5e1f.html \
|
||||||
--found ut.httrack.com/unicode-links/café647f.html \
|
--found ut.httrack.com/unicode-links/café647f.html \
|
||||||
--found ut.httrack.com/unicode-links/café7b30.html \
|
--found ut.httrack.com/unicode-links/café7b30.html \
|
||||||
--found ut.httrack.com/unicode-links/café8007.html \
|
--found ut.httrack.com/unicode-links/café8007.html \
|
||||||
--found ut.httrack.com/unicode-links/caféaec2.html \
|
--found ut.httrack.com/unicode-links/caféaec2.html \
|
||||||
--found ut.httrack.com/unicode-links/caféfad6.html \
|
--found ut.httrack.com/unicode-links/caféfad6.html \
|
||||||
--found ut.httrack.com/unicode-links/iso88591.html \
|
--found ut.httrack.com/unicode-links/iso88591.html \
|
||||||
httrack http://ut.httrack.com/unicode-links/iso88591.html
|
httrack http://ut.httrack.com/unicode-links/iso88591.html
|
||||||
|
|
||||||
bash crawl-test.sh \
|
bash crawl-test.sh \
|
||||||
--errors 4 --files 9 \
|
--errors 4 --files 9 \
|
||||||
--found ut.httrack.com/unicode-links/caf%a8%a6c72a.html \
|
--found ut.httrack.com/unicode-links/caf%a8%a6c72a.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%a9bf59.html \
|
--found ut.httrack.com/unicode-links/caf%a9bf59.html \
|
||||||
--found ut.httrack.com/unicode-links/café8007.html \
|
--found ut.httrack.com/unicode-links/café8007.html \
|
||||||
--found ut.httrack.com/unicode-links/cafébf43.html \
|
--found ut.httrack.com/unicode-links/cafébf43.html \
|
||||||
--found ut.httrack.com/unicode-links/cafédcd8.html \
|
--found ut.httrack.com/unicode-links/cafédcd8.html \
|
||||||
--found ut.httrack.com/unicode-links/café2461.html \
|
--found ut.httrack.com/unicode-links/café2461.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%a8%a61bce.html \
|
--found ut.httrack.com/unicode-links/caf%a8%a61bce.html \
|
||||||
--found ut.httrack.com/unicode-links/caf%a9ae52.html \
|
--found ut.httrack.com/unicode-links/caf%a9ae52.html \
|
||||||
--found ut.httrack.com/unicode-links/café7b30.html \
|
--found ut.httrack.com/unicode-links/café7b30.html \
|
||||||
--found ut.httrack.com/unicode-links/café30f4.html \
|
--found ut.httrack.com/unicode-links/café30f4.html \
|
||||||
--found ut.httrack.com/unicode-links/café5e1f.html \
|
--found ut.httrack.com/unicode-links/café5e1f.html \
|
||||||
--found ut.httrack.com/unicode-links/café3860.html \
|
--found ut.httrack.com/unicode-links/café3860.html \
|
||||||
--found ut.httrack.com/unicode-links/gb18030.html \
|
--found ut.httrack.com/unicode-links/gb18030.html \
|
||||||
httrack http://ut.httrack.com/unicode-links/gb18030.html
|
httrack http://ut.httrack.com/unicode-links/gb18030.html
|
||||||
|
|||||||
@@ -3,10 +3,10 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
# http://code.google.com/p/httrack/issues/detail?id=42&can=1
|
# http://code.google.com/p/httrack/issues/detail?id=42&can=1
|
||||||
# we expect 2 errors only because other links are too longs (to be modified if suitable)
|
# we expect 2 errors only because other links are too longs (to be modified if suitable)
|
||||||
bash crawl-test.sh --errors 2 --files 1 \
|
bash crawl-test.sh --errors 2 --files 1 \
|
||||||
--found ut.httrack.com/overflow/longquerywithaccents.html \
|
--found ut.httrack.com/overflow/longquerywithaccents.html \
|
||||||
httrack http://ut.httrack.com/overflow/longquerywithaccents.php
|
httrack http://ut.httrack.com/overflow/longquerywithaccents.php
|
||||||
|
|||||||
@@ -3,45 +3,45 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
# http://code.google.com/p/httrack/issues/detail?id=4&can=1
|
# http://code.google.com/p/httrack/issues/detail?id=4&can=1
|
||||||
bash crawl-test.sh --errors 0 --files 4 \
|
bash crawl-test.sh --errors 0 --files 4 \
|
||||||
--found ut.httrack.com/parsing/back5e1f.gif \
|
--found ut.httrack.com/parsing/back5e1f.gif \
|
||||||
--found ut.httrack.com/parsing/events.html \
|
--found ut.httrack.com/parsing/events.html \
|
||||||
--found ut.httrack.com/parsing/fade230f4.gif \
|
--found ut.httrack.com/parsing/fade230f4.gif \
|
||||||
--found ut.httrack.com/parsing/fade3860.gif \
|
--found ut.httrack.com/parsing/fade3860.gif \
|
||||||
httrack http://ut.httrack.com/parsing/events.html
|
httrack http://ut.httrack.com/parsing/events.html
|
||||||
|
|
||||||
# http://code.google.com/p/httrack/issues/detail?id=2&can=1
|
# http://code.google.com/p/httrack/issues/detail?id=2&can=1
|
||||||
bash crawl-test.sh --errors 0 --files 3 \
|
bash crawl-test.sh --errors 0 --files 3 \
|
||||||
--found ut.httrack.com/parsing/background-image.css \
|
--found ut.httrack.com/parsing/background-image.css \
|
||||||
--found ut.httrack.com/parsing/background-image.html \
|
--found ut.httrack.com/parsing/background-image.html \
|
||||||
--found ut.httrack.com/parsing/fade.gif \
|
--found ut.httrack.com/parsing/fade.gif \
|
||||||
httrack http://ut.httrack.com/parsing/background-image.html
|
httrack http://ut.httrack.com/parsing/background-image.html
|
||||||
|
|
||||||
# javascript parsing
|
# javascript parsing
|
||||||
bash crawl-test.sh --errors 0 --files 3 \
|
bash crawl-test.sh --errors 0 --files 3 \
|
||||||
--found ut.httrack.com/parsing/back.gif \
|
--found ut.httrack.com/parsing/back.gif \
|
||||||
--found ut.httrack.com/parsing/fade.gif \
|
--found ut.httrack.com/parsing/fade.gif \
|
||||||
--found ut.httrack.com/parsing/javascript.html \
|
--found ut.httrack.com/parsing/javascript.html \
|
||||||
httrack http://ut.httrack.com/parsing/javascript.html
|
httrack http://ut.httrack.com/parsing/javascript.html
|
||||||
|
|
||||||
# handling of + before query string
|
# handling of + before query string
|
||||||
bash crawl-test.sh --errors 0 --files 6 \
|
bash crawl-test.sh --errors 0 --files 6 \
|
||||||
--found ut.httrack.com/parsing/escaping.html \
|
--found ut.httrack.com/parsing/escaping.html \
|
||||||
--found "ut.httrack.com/parsing/foo bar30f4.html" \
|
--found "ut.httrack.com/parsing/foo bar30f4.html" \
|
||||||
--found "ut.httrack.com/parsing/foo bar5e1f.html" \
|
--found "ut.httrack.com/parsing/foo bar5e1f.html" \
|
||||||
--found "ut.httrack.com/parsing/foo+bar+plus3860.html" \
|
--found "ut.httrack.com/parsing/foo+bar+plus3860.html" \
|
||||||
--found "ut.httrack.com/parsing/foo barae52.html" \
|
--found "ut.httrack.com/parsing/foo barae52.html" \
|
||||||
--found "ut.httrack.com/parsing/foo bar7b30.html" \
|
--found "ut.httrack.com/parsing/foo bar7b30.html" \
|
||||||
httrack http://ut.httrack.com/parsing/escaping.html
|
httrack http://ut.httrack.com/parsing/escaping.html
|
||||||
|
|
||||||
# handling of # encoded in filename
|
# handling of # encoded in filename
|
||||||
# see http://code.google.com/p/httrack/issues/detail?id=25
|
# see http://code.google.com/p/httrack/issues/detail?id=25
|
||||||
bash crawl-test.sh --errors 2 --files 4 \
|
bash crawl-test.sh --errors 2 --files 4 \
|
||||||
--found "ut.httrack.com/parsing/escaping2.html" \
|
--found "ut.httrack.com/parsing/escaping2.html" \
|
||||||
--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
|
--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
|
||||||
--found "ut.httrack.com/parsing/foo#bar#.html" \
|
--found "ut.httrack.com/parsing/foo#bar#.html" \
|
||||||
--found "ut.httrack.com/parsing/foo bar.html" \
|
--found "ut.httrack.com/parsing/foo bar.html" \
|
||||||
httrack http://ut.httrack.com/parsing/escaping2.html
|
httrack http://ut.httrack.com/parsing/escaping2.html
|
||||||
|
|||||||
@@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
bash check-network.sh || ! echo "skipping online unit tests" || exit 77
|
||||||
|
|
||||||
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
||||||
echo "no https support compiled, skipping"
|
echo "no https support compiled, skipping"
|
||||||
exit 77
|
exit 77
|
||||||
fi
|
fi
|
||||||
|
|
||||||
bash crawl-test.sh --errors 0 --files 5 httrack https://ut.httrack.com/simple/basic.html
|
bash crawl-test.sh --errors 0 --files 5 httrack https://ut.httrack.com/simple/basic.html
|
||||||
|
|||||||
136
tests/13_crawl_proxy_https.test
Normal file
136
tests/13_crawl_proxy_https.test
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Issue #85: an https crawl must go through the configured proxy (CONNECT
|
||||||
|
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
|
||||||
|
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
: "${top_srcdir:=..}"
|
||||||
|
|
||||||
|
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
||||||
|
echo "no https support compiled, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
|
||||||
|
echo "python3/openssl missing, skipping"
|
||||||
|
exit 77
|
||||||
|
fi
|
||||||
|
|
||||||
|
server="$top_srcdir/tests/proxy-https-server.py"
|
||||||
|
tmpdir=$(mktemp -d)
|
||||||
|
pids=
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
for pid in $pids; do
|
||||||
|
kill "$pid" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
# self-signed cert for the local TLS origin (httrack does not verify certs)
|
||||||
|
openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
|
||||||
|
-out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
|
||||||
|
>/dev/null 2>&1
|
||||||
|
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
|
||||||
|
|
||||||
|
# start_server <logdir> <mode>: launches a proxy+origin pair, sets $origin_port
|
||||||
|
# and $proxy_port from its announced ephemeral ports.
|
||||||
|
start_server() {
|
||||||
|
local dir="$1" mode="$2" ports
|
||||||
|
mkdir -p "$dir"
|
||||||
|
ports="$dir/ports.txt"
|
||||||
|
python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
|
||||||
|
>"$ports" 2>"$dir/server.err" &
|
||||||
|
pids="$pids $!"
|
||||||
|
for _ in $(seq 1 100); do
|
||||||
|
grep -q "^ready" "$ports" 2>/dev/null && break
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
grep -q "^ready" "$ports" 2>/dev/null || {
|
||||||
|
echo "server ($mode) did not start" >&2
|
||||||
|
cat "$dir/server.err" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
|
||||||
|
proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
|
||||||
|
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
|
||||||
|
# whole job. A portable stand-in for `timeout`, which macOS lacks.
|
||||||
|
HANG_RC=137 # 128 + SIGKILL
|
||||||
|
run_crawl() {
|
||||||
|
local out="$1" proxy="$2" port="$3"
|
||||||
|
rm -rf "$out"
|
||||||
|
httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
|
||||||
|
-O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
|
||||||
|
local pid=$!
|
||||||
|
(sleep 60 && kill -9 "$pid" 2>/dev/null) &
|
||||||
|
local guard=$!
|
||||||
|
local rc=0
|
||||||
|
wait "$pid" 2>/dev/null || rc=$?
|
||||||
|
kill "$guard" 2>/dev/null || true
|
||||||
|
wait "$guard" 2>/dev/null || true
|
||||||
|
return "$rc"
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- working proxy ----------------------------------------------------------
|
||||||
|
ok="$tmpdir/ok"
|
||||||
|
start_server "$ok" ok
|
||||||
|
|
||||||
|
# 1. page retrieved AND the proxy saw a CONNECT to the origin
|
||||||
|
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
|
||||||
|
echo "FAIL: origin page not downloaded through proxy" >&2
|
||||||
|
cat "$ok/out.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
|
||||||
|
echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
|
||||||
|
cat "$ok/proxy.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK: https tunneled through proxy via CONNECT"
|
||||||
|
|
||||||
|
# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
|
||||||
|
: >"$ok/proxy.log"
|
||||||
|
: >"$ok/origin-headers.log"
|
||||||
|
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
|
||||||
|
echo "FAIL: origin page not downloaded through authenticated proxy" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
|
||||||
|
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
|
||||||
|
# which differs between GNU and BSD)
|
||||||
|
test "$got" == "dXNlcjpzZWNyZXQ=" || {
|
||||||
|
echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
|
||||||
|
cat "$ok/proxy.log" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
|
||||||
|
echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
|
||||||
|
cat "$ok/origin-headers.log" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
|
||||||
|
|
||||||
|
# --- hostile proxy ----------------------------------------------------------
|
||||||
|
# A proxy that answers 200 then streams headers forever must not hang the crawl:
|
||||||
|
# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
|
||||||
|
# missing bound surfaces as $HANG_RC here.
|
||||||
|
flood="$tmpdir/flood"
|
||||||
|
start_server "$flood" flood
|
||||||
|
rc=0
|
||||||
|
run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
|
||||||
|
test "$rc" -ne "$HANG_RC" || {
|
||||||
|
echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null && {
|
||||||
|
echo "FAIL: flooding proxy unexpectedly served the page" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
echo "OK: bounded proxy response, no hang on a flooding proxy"
|
||||||
@@ -2,6 +2,7 @@
|
|||||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||||
# silently drop it from the dist tarball and break "make distcheck".
|
# silently drop it from the dist tarball and break "make distcheck".
|
||||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||||
|
proxy-https-server.py \
|
||||||
fixtures/cache-golden/hts-cache/new.zip
|
fixtures/cache-golden/hts-cache/new.zip
|
||||||
|
|
||||||
TESTS_ENVIRONMENT =
|
TESTS_ENVIRONMENT =
|
||||||
@@ -24,6 +25,8 @@ TESTS = \
|
|||||||
01_engine-cache-golden.test \
|
01_engine-cache-golden.test \
|
||||||
01_engine-charset.test \
|
01_engine-charset.test \
|
||||||
01_engine-cmdline.test \
|
01_engine-cmdline.test \
|
||||||
|
01_engine-cookies.test \
|
||||||
|
01_engine-copyopt.test \
|
||||||
01_engine-doitlog.test \
|
01_engine-doitlog.test \
|
||||||
01_engine-entities.test \
|
01_engine-entities.test \
|
||||||
01_engine-filter.test \
|
01_engine-filter.test \
|
||||||
@@ -32,6 +35,7 @@ TESTS = \
|
|||||||
01_engine-mime.test \
|
01_engine-mime.test \
|
||||||
01_engine-parse.test \
|
01_engine-parse.test \
|
||||||
01_engine-rcfile.test \
|
01_engine-rcfile.test \
|
||||||
|
01_engine-relative.test \
|
||||||
01_engine-simplify.test \
|
01_engine-simplify.test \
|
||||||
01_engine-strsafe.test \
|
01_engine-strsafe.test \
|
||||||
02_manpage-regen.test \
|
02_manpage-regen.test \
|
||||||
@@ -42,6 +46,7 @@ TESTS = \
|
|||||||
11_crawl-international.test \
|
11_crawl-international.test \
|
||||||
11_crawl-longurl.test \
|
11_crawl-longurl.test \
|
||||||
11_crawl-parsing.test \
|
11_crawl-parsing.test \
|
||||||
12_crawl_https.test
|
12_crawl_https.test \
|
||||||
|
13_crawl_proxy_https.test
|
||||||
|
|
||||||
CLEANFILES = check-network_sh.cache
|
CLEANFILES = check-network_sh.cache
|
||||||
|
|||||||
@@ -6,39 +6,39 @@
|
|||||||
|
|
||||||
# do not enable online tests (./configure --disable-online-unit-tests)
|
# do not enable online tests (./configure --disable-online-unit-tests)
|
||||||
if test "$ONLINE_UNIT_TESTS" == "no"; then
|
if test "$ONLINE_UNIT_TESTS" == "no"; then
|
||||||
echo "online tests are disabled" >&2
|
echo "online tests are disabled" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|
||||||
# enable online tests (--enable-online-unit-tests)
|
# enable online tests (--enable-online-unit-tests)
|
||||||
elif test "$ONLINE_UNIT_TESTS" == "yes"; then
|
elif test "$ONLINE_UNIT_TESTS" == "yes"; then
|
||||||
exit 0
|
exit 0
|
||||||
|
|
||||||
# check if online tests are reachable
|
# check if online tests are reachable
|
||||||
else
|
else
|
||||||
|
|
||||||
# test url
|
# test url
|
||||||
url=http://ut.httrack.com/enabled
|
url=http://ut.httrack.com/enabled
|
||||||
|
|
||||||
# cache file name
|
# cache file name
|
||||||
cache=check-network_sh.cache
|
cache=check-network_sh.cache
|
||||||
|
|
||||||
# cached result ?
|
# cached result ?
|
||||||
if test -f $cache ; then
|
if test -f $cache; then
|
||||||
if grep -q "ok" $cache ; then
|
if grep -q "ok" $cache; then
|
||||||
exit 0
|
exit 0
|
||||||
else
|
else
|
||||||
echo "online tests are disabled (cached)" >&2
|
echo "online tests are disabled (cached)" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# fetch single file
|
# fetch single file
|
||||||
elif bash crawl-test.sh --errors 0 --files 1 httrack --timeout=3 --max-time=3 "$url" 2>/dev/null >/dev/null ; then
|
elif bash crawl-test.sh --errors 0 --files 1 httrack --timeout=3 --max-time=3 "$url" 2>/dev/null >/dev/null; then
|
||||||
echo "ok" > $cache
|
echo "ok" >$cache
|
||||||
exit 0
|
exit 0
|
||||||
else
|
else
|
||||||
echo "error" > $cache
|
echo "error" >$cache
|
||||||
echo "online tests are disabled (auto)" >&2
|
echo "online tests are disabled (auto)" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -2,185 +2,184 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
function warning {
|
function warning {
|
||||||
echo "** $*" >&2
|
echo "** $*" >&2
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
function die {
|
function die {
|
||||||
warning "$*"
|
warning "$*"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
function debug {
|
function debug {
|
||||||
if test -n "$verbose"; then
|
if test -n "$verbose"; then
|
||||||
echo "$*" >&2
|
echo "$*" >&2
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
function info {
|
function info {
|
||||||
printf "[$*] ..\t" >&2
|
printf '[%s] ..\t' "$*" >&2
|
||||||
}
|
}
|
||||||
|
|
||||||
function result {
|
function result {
|
||||||
echo "$*" >&2
|
echo "$*" >&2
|
||||||
}
|
}
|
||||||
|
|
||||||
function cleanup {
|
function cleanup {
|
||||||
debug "cleaning function called"
|
debug "cleaning function called"
|
||||||
if test -n "$tmpdir"; then
|
if test -n "$tmpdir"; then
|
||||||
if test -d "$tmpdir"; then
|
if test -d "$tmpdir"; then
|
||||||
if test -z "$nopurge"; then
|
if test -z "$nopurge"; then
|
||||||
debug "cleaning up $tmpdir"
|
debug "cleaning up $tmpdir"
|
||||||
rm -rf "$tmpdir"
|
rm -rf "$tmpdir"
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if test -n "$crawlpid"; then
|
||||||
|
debug "killing $crawlpid"
|
||||||
|
kill -9 "$crawlpid"
|
||||||
|
crawlpid=
|
||||||
fi
|
fi
|
||||||
fi
|
|
||||||
if test -n "$crawlpid"; then
|
|
||||||
debug "killing $crawlpid"
|
|
||||||
kill -9 "$crawlpid"
|
|
||||||
crawlpid=
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function usage {
|
function usage {
|
||||||
cat << EOF
|
cat <<EOF
|
||||||
usage: $0
|
usage: $0
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
function assert_equals {
|
function assert_equals {
|
||||||
info "$1"
|
info "$1"
|
||||||
if test ! "$2" == "$3"; then
|
if test ! "$2" == "$3"; then
|
||||||
result "expected '$2', got '$3'"
|
result "expected '$2', got '$3'"
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
result "OK ($2)"
|
result "OK ($2)"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Run one httrack crawl into "$tmpdir/crawl" and audit the resulting mirror.
#
# Arguments: [option...] httrack <httrack arguments...>
#   --debug            enable debug output (sets $verbose)
#   --no-purge         keep the crawl directory (sets $nopurge, clears $tmpdir)
#   --summary          print the "mirror complete" line of hts-log.txt
#   --print-files      list every regular file below the crawl directory
#   --errors N         expect exactly N "Error:" lines in hts-log.txt
#   --files N          expect N "files written" in the summary line
#   --found PATH       expect the regular file PATH in the mirror
#   --not-found PATH   expect the regular file PATH to be absent from the mirror
#   --directory PATH   expect the directory PATH in the mirror
#
# Returns 1 on bad usage, when httrack cannot be found or run, or when the
# crawl itself fails; a failed audit exits the script with status 1.
function start-crawl {
  # First pass: validate the options and locate the "httrack" separator.
  # Afterwards $pos is the index of the first httrack argument.
  pos=1
  while test "$#" -ge "$pos"; do
    case "${!pos}" in
    --debug)
      verbose=1
      ;;
    --no-purge | --summary | --print-files) ;;
    --errors | --files | --found | --not-found | --directory)
      pos=$((pos + 1))
      # "! warning", as in the guards below: without the negation a warning
      # that returns 0 would short-circuit the chain and skip "return 1".
      test "$#" -ge "$pos" || ! warning "missing argument" || return 1
      ;;
    httrack)
      pos=$((pos + 1))
      break
      ;;
    *)
      warning "unrecognized option ${!pos}"
      return 1
      ;;
    esac
    pos=$((pos + 1))
  done
  debug "remaining args: ${*:pos}"

  # ut/ won't exceed 2 minutes
  moreargs=(--quiet --max-time=120 --timeout=30 --connection-per-second=5)

  # proxy environment ?
  if test -n "${http_proxy:-}"; then
    moreargs+=(--proxy "$http_proxy")
  fi

  # Fresh crawl directory below the working directory.
  test -n "$tmpdir" || ! warning "no tmpdir" || return 1
  tmp="${tmpdir}/crawl"
  rm -rf "$tmp"
  mkdir "$tmp" || ! warning "could not create $tmp" || return 1

  # httrack must be on the PATH and able to report its version.
  which httrack >/dev/null || ! warning "could not find httrack" || return 1
  ver=$(httrack -O /dev/null --version | sed -e 's/HTTrack version //')
  test -n "$ver" || ! warning "could not run httrack" || return 1

  # start crawl
  log="${tmp}/log"
  debug starting httrack -O "${tmp}" "${moreargs[@]}" "${@:pos}"
  info "running httrack ${*:pos}"
  # Run in the background and record the pid so that cleanup can kill the
  # crawler if the script is interrupted while waiting.
  httrack -O "${tmp}" --user-agent="httrack $ver ut ($(uname -omrs))" "${moreargs[@]}" "${@:pos}" >"${log}" 2>&1 &
  crawlpid="$!"
  debug "started cralwer on pid $crawlpid"
  wait "$crawlpid"
  result="$?"
  crawlpid=
  test "$result" -eq 0 || ! result "error code $result" || return 1
  result "OK"
  # Show the errors logged by httrack, if any (informational only).
  grep -iE "^[0-9\:]*[[:space:]]Error:" "${tmp}/hts-log.txt" >&2

  # now audit: second pass over the same options, consuming them up to the
  # "httrack" separator.
  while test "$#" -gt 0; do
    case "$1" in
    --no-purge)
      nopurge=1
      ;;
    --summary)
      grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${tmp}/hts-log.txt"
      ;;
    --print-files)
      find "${tmp}" -mindepth 1 -type f
      ;;
    --errors)
      shift
      assert_equals "checking errors" "$1" "$(grep -iEc "^[0-9\:]*[[:space:]]Error:" "${tmp}/hts-log.txt")"
      ;;
    --found)
      shift
      info "checking for $1"
      if test -f "${tmp}/$1"; then
        result "OK"
      else
        result "not found"
        exit 1
      fi
      ;;
    --not-found)
      shift
      info "checking for $1"
      # Inverse of --found: the check passes when the file is absent.
      if test ! -f "${tmp}/$1"; then
        result "OK"
      else
        result "found"
        exit 1
      fi
      ;;
    --directory)
      shift
      info "checking for $1"
      if test -d "${tmp}/$1"; then
        result "OK"
      else
        result "not found"
        exit 1
      fi
      ;;
    --files)
      shift
      # Extract the word preceding "files written" from the summary line.
      nFiles=$(grep -E "^HTTrack Website Copier/[^ ]* mirror complete in " "${tmp}/hts-log.txt" |
        sed -e 's/.*[[:space:]]\([^ ]*\)[[:space:]]files written.*/\1/g')
      assert_equals "checking files" "$1" "$nFiles"
      ;;
    httrack)
      break
      ;;
    esac
    shift
  done

  # cleanup: drop the crawl directory, or, with --no-purge, forget the working
  # directory so that the cleanup handler leaves it in place.
  if test -z "$nopurge"; then
    rm -rf "$tmp"
  else
    tmpdir=
  fi
}
|
||||||
|
|
||||||
# check args
|
# check args
|
||||||
@@ -195,7 +194,7 @@ tmpdir=
|
|||||||
crawlpid=
|
crawlpid=
|
||||||
nopurge=
|
nopurge=
|
||||||
verbose=
|
verbose=
|
||||||
trap "cleanup" 0 1 2 3 4 5 6 7 8 9 11 13 14 15 16 19 24 25
|
trap cleanup EXIT HUP INT QUIT ILL TRAP ABRT BUS FPE SEGV PIPE ALRM TERM STKFLT XCPU XFSZ
|
||||||
|
|
||||||
# working directory
|
# working directory
|
||||||
tmpdir="${tmptopdir}/httrack_ut.$$"
|
tmpdir="${tmptopdir}/httrack_ut.$$"
|
||||||
|
|||||||
151
tests/proxy-https-server.py
Normal file
151
tests/proxy-https-server.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
|
||||||
|
|
||||||
|
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
|
||||||
|
ports. Every request line the proxy receives (and any Proxy-Authorization) is
|
||||||
|
appended to the proxy log; every header the origin receives over the tunnel is
|
||||||
|
appended to the origin log. That lets the test assert both that an https crawl
|
||||||
|
tunneled through the proxy and that proxy credentials never leaked to the origin.
|
||||||
|
|
||||||
|
Proxy modes (argv[3], default "ok"):
|
||||||
|
ok - honour CONNECT and tunnel to the origin
|
||||||
|
flood - answer 200 then stream headers forever with no blank line, to exercise
|
||||||
|
the client's bound on the proxy response (must not hang the crawl)
|
||||||
|
|
||||||
|
Usage: proxy-https-server.py <cert.pem> <logdir> [mode]
|
||||||
|
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
|
||||||
|
"""
|
||||||
|
import http.server
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import socketserver
|
||||||
|
import ssl
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
|
||||||
|
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
|
||||||
|
PROXY_LOG = "proxy.log"
|
||||||
|
ORIGIN_LOG = "origin-headers.log"
|
||||||
|
|
||||||
|
|
||||||
|
def make_origin(logdir):
|
||||||
|
class Origin(http.server.BaseHTTPRequestHandler):
|
||||||
|
def do_GET(self):
|
||||||
|
with open(os.path.join(logdir, ORIGIN_LOG), "a") as handle:
|
||||||
|
for key in self.headers.keys():
|
||||||
|
handle.write(key + "\n")
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header("Content-Type", "text/html")
|
||||||
|
self.send_header("Content-Length", str(len(ORIGIN_BODY)))
|
||||||
|
self.end_headers()
|
||||||
|
self.wfile.write(ORIGIN_BODY)
|
||||||
|
|
||||||
|
def log_message(self, *args):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return Origin
|
||||||
|
|
||||||
|
|
||||||
|
def start_origin(certfile, logdir):
    """Start the TLS origin server on a background thread and return its port.

    The server binds 127.0.0.1 on an ephemeral port (port 0), wraps its
    listening socket with a server-side TLS context loaded from *certfile*
    (no separate key file is passed), and serves the handler built by
    ``make_origin(logdir)`` from a daemon thread.
    """
    server = socketserver.TCPServer(("127.0.0.1", 0), make_origin(logdir))

    tls = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    tls.load_cert_chain(certfile)
    server.socket = tls.wrap_socket(server.socket, server_side=True)

    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    return server.socket.getsockname()[1]
|
||||||
|
|
||||||
|
|
||||||
|
def pipe(src, dst):
    """Relay bytes from *src* to *dst* until EOF or a socket error.

    Whatever ends the relay, both sockets are then shut down in both
    directions and closed. Shutting down also wakes up a peer thread relaying
    the opposite direction over the same pair; closing releases the file
    descriptors, which a bare shutdown leaves open.

    Socket errors are swallowed on purpose: a tunnel endpoint going away is
    the normal way for a relay to finish.
    """
    try:
        while True:
            data = src.recv(65536)
            if not data:
                break
            dst.sendall(data)
    except OSError:
        pass
    finally:
        for sock in (src, dst):
            try:
                sock.shutdown(socket.SHUT_RDWR)
            except OSError:
                # Already shut down or closed, e.g. by the opposite relay.
                pass
            finally:
                sock.close()
|
||||||
|
|
||||||
|
|
||||||
|
def handle_client(conn, logdir, mode):
    """Serve one proxy connection: log the request, then refuse, flood or tunnel.

    The request line, and the value of a ``Proxy-Authorization`` header when
    one is present (prefixed with ``AUTH``), are appended to ``PROXY_LOG`` in
    *logdir*. After that:

    * anything but ``CONNECT <target>`` is answered with 501;
    * in ``"flood"`` mode the reply is a 200 status line followed by header
      lines that never end, sent until the client goes away;
    * otherwise a connection to ``host[:port]`` (port 443 by default) is
      opened and bytes are relayed both ways until either side closes; a
      target that cannot be reached, or whose port is not a valid number, is
      answered with 502.

    Socket errors never escape: the client socket is closed on every path.
    """
    try:
        with conn.makefile("rb") as rfile:
            request_line = rfile.readline().decode("latin-1").strip()
            auth = None
            while True:
                line = rfile.readline().decode("latin-1")
                # A blank line ends the headers; "" means the peer closed.
                if line in ("\r\n", "\n", ""):
                    break
                key, _, value = line.partition(":")
                if key.strip().lower() == "proxy-authorization":
                    auth = value.strip()
    except OSError:
        # The client vanished before sending a complete request.
        conn.close()
        return

    with open(os.path.join(logdir, PROXY_LOG), "a") as handle:
        handle.write(request_line + "\n")
        if auth is not None:
            handle.write("AUTH " + auth + "\n")

    parts = request_line.split()
    if not (len(parts) >= 2 and parts[0] == "CONNECT"):
        try:
            conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
        except OSError:
            pass
        finally:
            conn.close()
        return

    if mode == "flood":
        # 200, then an endless header stream with no terminating blank line: the
        # client must bound this and give up, not hang.
        try:
            conn.sendall(b"HTTP/1.0 200 Connection established\r\n")
            while True:
                conn.sendall(b"X-Pad: 0123456789\r\n")
        except OSError:
            pass
        conn.close()
        return

    host, _, port = parts[1].partition(":")
    try:
        upstream = socket.create_connection((host, int(port or 443)))
    except (OSError, ValueError, OverflowError):
        # Unreachable target, or a port that is not a number in range.
        try:
            conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
        except OSError:
            pass
        finally:
            conn.close()
        return

    try:
        conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
    except OSError:
        upstream.close()
        conn.close()
        return

    # One relay per direction: client -> origin on a helper thread,
    # origin -> client on this one.
    relay = threading.Thread(target=pipe, args=(conn, upstream), daemon=True)
    relay.start()
    pipe(upstream, conn)
    # pipe() has shut both sockets down, so the helper's blocking call returns
    # promptly; once it is done nobody uses the sockets any more.
    relay.join()
    upstream.close()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def start_proxy(logdir, mode):
    """Start the proxy listener on a background thread and return its port.

    The listener binds 127.0.0.1 on an ephemeral port (port 0). A daemon
    thread accepts connections forever and hands each one to
    ``handle_client(conn, logdir, mode)`` on a daemon thread of its own.
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind(("127.0.0.1", 0))
    listener.listen(16)

    def accept_forever():
        while True:
            client, _peer = listener.accept()
            worker = threading.Thread(
                target=handle_client, args=(client, logdir, mode), daemon=True
            )
            worker.start()

    threading.Thread(target=accept_forever, daemon=True).start()
    return listener.getsockname()[1]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Start the origin and the proxy, announce their ports, then block forever.

    Reads ``<cert.pem> <logdir> [mode]`` from ``sys.argv``; *mode* defaults to
    ``"ok"``. Both log files are truncated before the servers start. The
    lines ``ORIGIN <port>``, ``PROXY <port>`` and ``ready`` are then printed
    on stdout, each flushed immediately.

    Exits with status 2 and a usage message on stderr when the two mandatory
    arguments are missing.
    """
    if len(sys.argv) < 3:
        print(
            "usage: %s <cert.pem> <logdir> [mode]" % sys.argv[0],
            file=sys.stderr,
        )
        sys.exit(2)

    certfile, logdir = sys.argv[1], sys.argv[2]
    mode = sys.argv[3] if len(sys.argv) > 3 else "ok"

    # Start from empty logs so that entries from a previous run cannot
    # satisfy (or break) the checks made on them.
    for name in (PROXY_LOG, ORIGIN_LOG):
        with open(os.path.join(logdir, name), "w"):
            pass

    origin_port = start_origin(certfile, logdir)
    proxy_port = start_proxy(logdir, mode)
    print("ORIGIN %d" % origin_port, flush=True)
    print("PROXY %d" % proxy_port, flush=True)
    print("ready", flush=True)

    # The servers run on daemon threads: park the main thread until the
    # process is killed.
    threading.Event().wait()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -2,19 +2,19 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# Run every *.test script of the current directory with bash, report each
# outcome on stderr, and exit with the number of failed scripts.
error=0
for i in *.test; do
  if ! bash "$i"; then
    echo "$i: ERROR" >&2
    error=$((error + 1))
    continue
  fi
  echo "$i: passed" >&2
done

# Overall verdict.
case "$error" in
0) echo "all tests passed" >&2 ;;
*) echo "${error} test(s) failed" >&2 ;;
esac

exit $error
|
||||||
|
|||||||
Reference in New Issue
Block a user