mirror of
https://github.com/xroche/httrack.git
synced 2026-06-25 19:47:36 +03:00
Compare commits
4 Commits
fix-206-up
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5501faa7b1 | ||
|
|
6322b6fb1f | ||
|
|
58f368a91a | ||
|
|
c97b3e233e |
@@ -353,6 +353,14 @@ static void basic_selftests(void) {
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
|
||||
"noextfile", 1) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
|
||||
// empty fil: no extension to scan; must not over-read before the string.
|
||||
// flag==0 -> 0 (nothing written), flag==1 -> octet-stream.
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
|
||||
0) == 0);
|
||||
assertf(r.contenttype[0] == '\0');
|
||||
assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
|
||||
1) == 1);
|
||||
assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
|
||||
// a user --assume rule with an empty value matches but writes nothing:
|
||||
// get_userhttptype returns 1 with the buffer empty, so get_httptype_sized
|
||||
// must still report 0 (callers test the return like the old
|
||||
|
||||
@@ -4177,9 +4177,10 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
/* Check html -> text/html */
|
||||
const char *a = fil + strlen(fil) - 1;
|
||||
|
||||
while((*a != '.') && (*a != '/') && (a > fil))
|
||||
/* a < fil when fil is empty: bound before dereferencing */
|
||||
while ((a > fil) && (*a != '.') && (*a != '/'))
|
||||
a--;
|
||||
if (*a == '.' && strlen(a) < 32) {
|
||||
if (a >= fil && *a == '.' && strlen(a) < 32) {
|
||||
int j = 0;
|
||||
|
||||
a++;
|
||||
|
||||
11
tests/21_local-intl-update.test
Normal file
11
tests/21_local-intl-update.test
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# #157: a dotless, accented URL named .html on the first crawl must keep .html
|
||||
# across an update -- not revert to the extensionless name.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'intl/Instalação_CVS_no_Ubuntu.html' \
|
||||
--not-found 'intl/Instalação_CVS_no_Ubuntu' \
|
||||
httrack 'BASEURL/intl/index.html'
|
||||
17
tests/22_local-broken-size.test
Executable file
17
tests/22_local-broken-size.test
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
|
||||
# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# Default: warn, but the file is still written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-found 'bogus state \(broken size' \
|
||||
httrack 'BASEURL/size/index.html'
|
||||
|
||||
# -%B (tolerant): no warning, file written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'size/oversize.bin' \
|
||||
--log-not-found 'bogus state' \
|
||||
httrack 'BASEURL/size/index.html' '-%B'
|
||||
19
tests/23_local-errpage.test
Normal file
19
tests/23_local-errpage.test
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
|
||||
# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
|
||||
# half also does not reproduce; the purge path is not exercised here.)
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/good.html' \
|
||||
--found 'errpage/empty.html' \
|
||||
--not-found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o0'
|
||||
|
||||
# Control -o1 (default): the 404 error page is written.
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
|
||||
--found 'errpage/missing.html' \
|
||||
httrack 'BASEURL/errpage/index.html' '-o1'
|
||||
@@ -60,6 +60,9 @@ TESTS = \
|
||||
17_local-empty-ct.test \
|
||||
18_local-update.test \
|
||||
19_local-connect-fallback.test \
|
||||
20_local-resume-loop.test
|
||||
20_local-resume-loop.test \
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
# Usage:
|
||||
# bash local-crawl.sh [--tls] [--root DIR] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
|
||||
set -u
|
||||
|
||||
@@ -107,7 +109,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -196,6 +198,15 @@ if test -n "$rerun"; then
|
||||
exit 1
|
||||
}
|
||||
result "OK (update)"
|
||||
# The update summary reports "files updated"; a fresh crawl never does. Assert
|
||||
# it so a regression that bypasses the cache (re-crawls fresh) can't pass.
|
||||
info "checking update used the cache"
|
||||
if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
|
||||
result "OK"
|
||||
else
|
||||
result "update pass did not report cache activity"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
@@ -248,6 +259,22 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-found)
|
||||
i=$((i + 1))
|
||||
info "checking log matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
|
||||
result "not in log"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--log-not-found)
|
||||
i=$((i + 1))
|
||||
info "checking log lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
|
||||
result "present in log"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -177,6 +177,17 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# --- special chars in URLs across an update (issue #157) ---------------
|
||||
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||
# name the first crawl picks (.html) must survive the update pass.
|
||||
INTL_NAME = "Instalação_CVS_no_Ubuntu"
|
||||
|
||||
def route_intl_index(self):
|
||||
self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
|
||||
|
||||
def route_intl_page(self):
|
||||
self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
|
||||
|
||||
# resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
|
||||
# can be interrupted (partial + temp-ref); every later request is 416.
|
||||
RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096 # flushed before the stall
|
||||
@@ -214,6 +225,39 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
|
||||
# error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
|
||||
# bodies off disk; a genuine 0-byte 200 is a valid file and stays.
|
||||
def route_errpage_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="good.html">good</a>\n'
|
||||
'\t<a href="missing.html">missing</a>\n'
|
||||
'\t<a href="empty.html">empty</a>\n'
|
||||
)
|
||||
|
||||
def route_errpage_good(self):
|
||||
self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
|
||||
|
||||
def route_errpage_missing(self):
|
||||
self.send_html("\t404 error body", status=404, extra_status="Not Found")
|
||||
|
||||
def route_errpage_empty(self):
|
||||
self.send_raw(b"", "text/html")
|
||||
|
||||
# broken Content-Length (#32/#41): declared size != bytes sent. httrack
|
||||
# warns "bogus state (broken size)" and skips the cache unless -%B.
|
||||
def route_size_index(self):
|
||||
self.send_html('\t<a href="oversize.bin">over</a>\n')
|
||||
|
||||
def route_size_oversize(self):
|
||||
body = b"A" * 100
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
self.send_header("Content-Length", str(len(body) - 2)) # lie: too short
|
||||
self.send_header("Connection", "close")
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -233,8 +277,16 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/style.css": route_types,
|
||||
"/types/data.json": route_types,
|
||||
"/types/gen.php": route_types,
|
||||
"/intl/index.html": route_intl_index,
|
||||
"/intl/" + INTL_NAME: route_intl_page,
|
||||
"/resume/index.html": route_resume_index,
|
||||
"/resume/blob.txt": route_resume,
|
||||
"/size/index.html": route_size_index,
|
||||
"/size/oversize.bin": route_size_oversize,
|
||||
"/errpage/index.html": route_errpage_index,
|
||||
"/errpage/good.html": route_errpage_good,
|
||||
"/errpage/missing.html": route_errpage_missing,
|
||||
"/errpage/empty.html": route_errpage_empty,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
@@ -242,7 +294,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
handler = self.ROUTES.get(path)
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
if handler is not None:
|
||||
handler(self)
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user