mirror of
https://github.com/xroche/httrack.git
synced 2026-07-05 08:34:10 +03:00
Compare commits
2 Commits
master
...
wait-socke
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
75fd5ae90f | ||
|
|
1e7744865f |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -39,6 +39,3 @@ Makefile
|
||||
|
||||
# Editor / autotools backup files.
|
||||
*~
|
||||
|
||||
# Python bytecode (tests/local-server.py).
|
||||
__pycache__/
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.11], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.10], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,11 +29,10 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:3:0: 3.49.11 only adds enum values, macros and inline helpers to the
|
||||
# installed headers (no struct layout or exported signature changed vs
|
||||
# 3.49.10), so it stays soname .so.3; bump revision.
|
||||
# 3:2:0: 3.49.10 only appends tail fields to the options struct (no existing
|
||||
# symbol or offset changed vs 3.49.9), so it stays soname .so.3; bump revision.
|
||||
# (3:0:0 was the htsblk mime-buffer widening, the ABI break that moved .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:3:0"
|
||||
VERSION_INFO="3:2:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
12
debian/changelog
vendored
12
debian/changelog
vendored
@@ -1,15 +1,3 @@
|
||||
httrack (3.49.11-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: crawl correctness and security fixes (network-facing
|
||||
buffer overflows, file-type detection, redirect handling) and modernized
|
||||
web defaults; full list in history.txt.
|
||||
* Add DEP-12 upstream metadata (#466).
|
||||
* Bump debhelper compat to 14 (#466).
|
||||
* Drop the redundant Priority field and update the NMU lintian override to
|
||||
the current tag names (#466).
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 05 Jul 2026 00:03:18 +0200
|
||||
|
||||
httrack (3.49.10-1) unstable; urgency=medium
|
||||
|
||||
* New upstream release: new download-pacing and URL-handling options plus a
|
||||
|
||||
17
history.txt
17
history.txt
@@ -4,23 +4,6 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-11
|
||||
+ New: parse robots.txt Allow rules and path wildcards per RFC 9309 (#452)
|
||||
+ New: advertise deflate in Accept-Encoding and decode deflate responses (#450)
|
||||
+ New: follow <source> and <track> media elements as embedded links (#451)
|
||||
+ New: added modern web MIME types to the type/extension table (#448)
|
||||
+ Fixed: enforce the -E time limit during a slow transfer instead of only between files (#481)
|
||||
+ Fixed: sniff the leading bytes of a download so a misdeclared Content-Type no longer renames a correct URL extension
|
||||
+ Fixed: fast transfers could be saved under their temporary .delayed placeholder name (#5, #107)
|
||||
+ Fixed: follow a redirect that maps to the same saved file instead of writing a self-pointing stub (#159)
|
||||
+ Fixed: several network-facing buffer overflows in the FTP, Java and HTML parsers
|
||||
+ Fixed: the htsjava plugin could not be loaded (hidden entry points, stale library name)
|
||||
+ Fixed: HTML-escape truncation and a cache-buffer leak in the parser
|
||||
+ Changed: modernized the default User-Agent to an honest HTTrack identifier (#449)
|
||||
+ Changed: decode the full WHATWG set of HTML named character references (#443)
|
||||
+ Changed: refreshed stale HTTP status, proxy-port and TLS-floor constants (#453)
|
||||
+ Changed: multiple internal hardening, build, test and CI improvements
|
||||
|
||||
3.49-10
|
||||
+ New: --cookies-file to preload a Netscape cookies.txt before crawling (#215)
|
||||
+ New: --pause to space out file downloads by a random delay (#185)
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-11"
|
||||
#define HTTRACK_VERSIONID "3.49.11"
|
||||
#define HTTRACK_VERSION "3.49-10"
|
||||
#define HTTRACK_VERSIONID "3.49.10"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# -M byte cap (#77): the crawl must stop with the "giving up" error and keep
|
||||
# the mirror well under the 8 x 640KB the fixture totals uncapped.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
# cap = -M + the 4 in-flight files the smooth stop lets finish + one of margin
|
||||
bash "$top_srcdir/tests/local-crawl.sh" \
|
||||
--log-found 'More than 400000 bytes have been transferred.. giving up' \
|
||||
--found bigfiles/p0.bin \
|
||||
--max-mirror-bytes 3700000 \
|
||||
httrack 'BASEURL/bigfiles/index.html' -M400000 -c4
|
||||
@@ -1,55 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Diverse seeded /big/ crawl: 12 pattern families, decoy absence, update pass
|
||||
# must 304-revalidate. 360 = 1 index + 96 pages + 192 imgs + 5 shared + 60
|
||||
# family + 6 singles; the 4 planted errors write -o1 pages, not counted.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --rerun \
|
||||
--errors 4 --files 360 \
|
||||
--found 'big/p/95.html' \
|
||||
--found 'big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png' \
|
||||
--found 'big/a/f2-2x.png' \
|
||||
--found 'big/a/subs.vtt' \
|
||||
--found 'big/a/font.woff2' \
|
||||
--found 'big/a/js-data.bin' \
|
||||
--found 'big/d/01.pdf' \
|
||||
--found 'big/d/named.pdf' \
|
||||
--found 'big/a/doc.pdf' \
|
||||
--found "big/f9/caf$(printf '\xc3\xa9').html" \
|
||||
--found 'big/f7/fa.html' \
|
||||
--found 'big/a/ref.png' \
|
||||
--found 'big/f6/sub/leaf.html' \
|
||||
--found 'big/f1/dir/index.html' \
|
||||
--found 'big/f10/empty.html' \
|
||||
--found 'big/indexd41d.html' \
|
||||
--found 'big/a/i0a.png' \
|
||||
--not-found 'big/x/og' \
|
||||
--not-found 'big/x/tw' \
|
||||
--not-found 'big/x/jsonld.png' \
|
||||
--not-found 'big/x/never-scanned.png' \
|
||||
--not-found 'big/x/atom-only.html' \
|
||||
--not-found 'big/x/sitemap-only.html' \
|
||||
--not-found 'big/x/form-target.html' \
|
||||
--not-found 'big/x/formact' \
|
||||
--not-found 'big/x/ping' \
|
||||
--not-found 'big/x/aj.jar' \
|
||||
--not-found 'big/x/bj.jar' \
|
||||
--not-found 'big/x/is1.png' \
|
||||
--not-found 'big/x/concat.html' \
|
||||
--file-matches 'big/p/2.html' 'srcset="\.\./a/f2-1x\.png 1x, \.\./a/f2-2x\.png 2x"' \
|
||||
--file-matches 'big/a/blk2.css' 'url\(blk2-bg\.png\)' \
|
||||
--file-matches 'big/p/5.html' "document\\.write\\('<a href=\"\\.\\./f5/dw\\.html\"" \
|
||||
--file-not-matches 'big/p/1.html' 'href="/big/' \
|
||||
--log-not-found 'bogus state|[Pp]anic|assert' \
|
||||
--log-found '\(404\) at link [^ ]*/big/e/404\.html' \
|
||||
--log-found '\(410\) at link [^ ]*/big/e/410\.html' \
|
||||
--log-found '\(500\) at link [^ ]*/big/e/500\.html' \
|
||||
--log-found 'decompressing.*big/e/gztrunc\.html' \
|
||||
--log-found ', no files updated' \
|
||||
--max-mirror-bytes 700000 \
|
||||
--min-mirror-bytes 500000 \
|
||||
httrack 'BASEURL/big/index.html' --retries=0 -c8 -%c100 -A100000000
|
||||
@@ -97,8 +97,6 @@ TESTS = \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test \
|
||||
34_local-maxtime.test \
|
||||
35_local-maxsize.test \
|
||||
36_local-bigcrawl.test
|
||||
34_local-maxtime.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -16,10 +16,8 @@
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# --max-mirror-bytes N \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --max/--min-mirror-bytes bound the mirrored content bytes (host root).
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
@@ -126,7 +124,7 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--found | --not-found | --directory | --log-found | --log-not-found | --max-mirror-bytes | --min-mirror-bytes)
|
||||
--found | --not-found | --directory | --log-found | --log-not-found)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
@@ -318,24 +316,6 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--max-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} <= ${audit[$i]} bytes"
|
||||
if test "$sz" -le "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too big"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--min-mirror-bytes)
|
||||
i=$((i + 1))
|
||||
sz=$(find "$hostroot" -type f -exec cat {} + | wc -c | tr -d '[:space:]')
|
||||
info "checking mirror size ${sz} >= ${audit[$i]} bytes"
|
||||
if test "$sz" -ge "${audit[$i]}"; then result "OK"; else
|
||||
result "mirror too small"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
|
||||
@@ -15,7 +15,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
@@ -43,416 +42,6 @@ PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"""
|
||||
|
||||
|
||||
# --- /big/ seeded pseudo-site (36_local-bigcrawl) ---------------------------
|
||||
# Deterministic ~360-file tree; bodies derive from sha256(BIG_SEED, name) so
|
||||
# every run serves identical content and the test pins exact counts.
|
||||
BIG_SEED = "bigcrawl-lite-1"
|
||||
BIG_PAGES = 96
|
||||
BIG_FANOUT = 4
|
||||
# Fixed validator: a matching If-Modified-Since gets 304, so the update pass
|
||||
# revalidates instead of re-downloading.
|
||||
BIG_LASTMOD = "Mon, 01 Jan 2024 00:00:00 GMT"
|
||||
|
||||
BIG_CTYPES = {
|
||||
"html": "text/html",
|
||||
"css": "text/css",
|
||||
"js": "application/x-javascript",
|
||||
"png": "image/png",
|
||||
"gif": "image/gif",
|
||||
"jpg": "image/jpeg",
|
||||
"webp": "image/webp",
|
||||
"pdf": "application/pdf",
|
||||
"woff2": "font/woff2",
|
||||
"mp4": "video/mp4",
|
||||
"webm": "video/webm",
|
||||
"mp3": "audio/mpeg",
|
||||
"vtt": "text/vtt",
|
||||
"xml": "text/xml",
|
||||
"svg": "image/svg+xml",
|
||||
"jar": "application/java-archive",
|
||||
"bin": "application/octet-stream",
|
||||
}
|
||||
|
||||
# Honest magic bytes per claimed type so the #478 sniff never contests.
|
||||
BIG_MAGIC = {
|
||||
"png": b"\x89PNG\r\n\x1a\n",
|
||||
"gif": b"GIF89a",
|
||||
"jpg": b"\xff\xd8\xff\xe0",
|
||||
"webp": b"RIFF\x10\x27\x00\x00WEBPVP8 ",
|
||||
"pdf": b"%PDF-1.4\n",
|
||||
"woff2": b"wOF2",
|
||||
"mp4": b"\x00\x00\x00\x18ftypmp42",
|
||||
"webm": b"\x1a\x45\xdf\xa3",
|
||||
"mp3": b"ID3\x04\x00\x00\x00\x00\x00\x00",
|
||||
"jar": b"PK\x03\x04",
|
||||
}
|
||||
|
||||
|
||||
def big_blob(name, size):
|
||||
out = b""
|
||||
n = 0
|
||||
while len(out) < size:
|
||||
out += hashlib.sha256(f"{BIG_SEED}/{name}/{n}".encode()).digest()
|
||||
n += 1
|
||||
return out[:size]
|
||||
|
||||
|
||||
def big_asset(name):
|
||||
ext = name.rsplit(".", 1)[-1]
|
||||
size = 200 + int(hashlib.sha256(name.encode()).hexdigest(), 16) % 3800
|
||||
raw = big_blob(name, size)
|
||||
if ext in ("css", "js", "txt"):
|
||||
return b"/* " + raw.hex().encode() + b" */"
|
||||
return BIG_MAGIC.get(ext, b"") + raw
|
||||
|
||||
|
||||
def big_html(title, inner):
|
||||
page = (
|
||||
"<!DOCTYPE html><html><head><title>%s</title></head><body>\n%s\n</body></html>"
|
||||
% (
|
||||
title,
|
||||
inner,
|
||||
)
|
||||
)
|
||||
return page.encode()
|
||||
|
||||
|
||||
def _hexfill(name):
|
||||
return big_blob(name, 160).hex()
|
||||
|
||||
|
||||
HOME = '<a href="/big/index.html">home</a>'
|
||||
|
||||
BIG_TEXT_ASSETS = {
|
||||
"site.css": (
|
||||
"body { background: url(bg.png); } /* %s */" % _hexfill("site.css"),
|
||||
"text/css",
|
||||
),
|
||||
"print.css": ("p { margin: 0; } /* %s */" % _hexfill("print.css"), "text/css"),
|
||||
"blk.css": (
|
||||
'@import "blk2.css";\n'
|
||||
'@font-face { font-family: big; src: local("Nope Sans"), '
|
||||
'url(font.woff2) format("woff2"); }\n'
|
||||
"/* %s */" % _hexfill("blk.css"),
|
||||
"text/css",
|
||||
),
|
||||
# Absolute url() must come back relative after the rewrite (test greps it);
|
||||
# the \/ escapes collapse to an already-linked URL if taken literally.
|
||||
"blk2.css": (
|
||||
"body { background: url(/big/a/blk2-bg.png); }\n"
|
||||
"i { background: url(/big\\/a\\/bg.png); }\n"
|
||||
"/* %s */" % _hexfill("blk2.css"),
|
||||
"text/css",
|
||||
),
|
||||
# .open() grabs its first arg only (a method there is rejected, #218), so
|
||||
# the window.open single-URL form is the token-detected shape.
|
||||
"app.js": (
|
||||
'var im = new Image(); im.src = "/big/a/js-img.png";\n'
|
||||
'function pop() { window.open("/big/a/js-data.bin"); }\n'
|
||||
"// %s\n" % _hexfill("app.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
"heavy.js": (
|
||||
'var h = new Image(); h.src = "/big/a/js1.png";\n'
|
||||
'function nav() { location.href = "/big/p/1.html"; }\n'
|
||||
'function pop() { window.open("/big/a/js2.bin"); }\n'
|
||||
"// %s\n" % _hexfill("heavy.js"),
|
||||
"application/x-javascript",
|
||||
),
|
||||
# text/javascript is fetched but never scanned: the URL inside must stay
|
||||
# out of the mirror.
|
||||
"decoy.js": (
|
||||
'var d = new Image(); d.src = "/big/x/never-scanned.png";\n',
|
||||
"text/javascript",
|
||||
),
|
||||
"subs.vtt": ("WEBVTT\n\n00:00.000 --> 00:01.000\nbig\n", "text/vtt"),
|
||||
"logo.svg": (
|
||||
'<svg xmlns="http://www.w3.org/2000/svg" width="4" height="4">'
|
||||
'<image href="ref.png" width="4" height="4"/></svg>',
|
||||
"image/svg+xml",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _fam_feeds(port):
|
||||
return (
|
||||
'<link rel="alternate" type="application/rss+xml" href="/big/f12/rss.xml">'
|
||||
'<a href="/big/f12/atom.xml">atom</a>'
|
||||
'<a href="/big/f12/sitemap.xml">sitemap</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_plain(port):
|
||||
return (
|
||||
'<a href="../f1/one.html">one</a>'
|
||||
'<a href="./two.html">two</a>'
|
||||
'<a href="../../big/f1/tri.html">tri</a>'
|
||||
'<a href="/big/f1/abs.html">abs</a>'
|
||||
'<a href="/big/f1/list.html">list</a>'
|
||||
'<a href="/big/f1/list.html?page=2">p2</a>'
|
||||
'<a href="/big/f1/list.html?page=3&sort=asc">p3</a>'
|
||||
'<a href="/big/f1/dir">dir</a>'
|
||||
'<a href="">self</a><a href="#">frag</a>'
|
||||
'<a href="mailto:big@example.com">mail</a>'
|
||||
'<a href="tel:+15551234">tel</a>'
|
||||
'<a href="data:text/plain;base64,aGk=">data</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_srcset(port):
|
||||
return (
|
||||
'<img src="/big/a/f2-base.png">'
|
||||
'<img srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png">'
|
||||
'<img data-srcset="/big/a/f2-1x.png 1x, /big/a/f2-2x.png 2x"'
|
||||
' src="/big/a/f2-base.png" loading="lazy">'
|
||||
'<picture><source type="image/webp" srcset="/big/a/f2-alt.webp">'
|
||||
'<img src="/big/a/f2-base.png"></picture>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_media(port):
|
||||
return (
|
||||
'<video src="/big/a/clip.mp4" poster="/big/a/poster.jpg">'
|
||||
'<source src="/big/a/clip.webm" type="video/webm">'
|
||||
'<track src="/big/a/subs.vtt" kind="subtitles" srclang="en">'
|
||||
"</video>"
|
||||
'<audio><source src="/big/a/tune.mp3" type="audio/mpeg"></audio>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_css(port):
|
||||
# image-set with descriptors is a proven-safe decoy (engine-surface §6).
|
||||
return (
|
||||
'<link rel="stylesheet" href="/big/a/print.css" media="print">'
|
||||
'<div style="background:url(/big/a/attr-bg.png)">styled</div>'
|
||||
'<style>@import "/big/a/blk.css"; h1 { background: url(/big/a/blk-bg.gif); }'
|
||||
' h2 { background-image: image-set("/big/x/is1.png" 1x, "/big/x/is2.png" 2x); }'
|
||||
"</style>"
|
||||
)
|
||||
|
||||
|
||||
def _fam_js(port):
|
||||
# The concatenated string is rejected by the scanner (no single literal).
|
||||
return (
|
||||
'<script src="/big/a/heavy.js"></script>'
|
||||
'<script src="/big/a/decoy.js"></script>'
|
||||
"<script>document.write('<a href=\"/big/f5/dw.html\">dw</a>');\n"
|
||||
'var nope = "xx-" + "/big/x/concat.html";</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_meta(port):
|
||||
# Extensionless decoy targets stay unfetchable even if the aggressive
|
||||
# parser fires (no known extension, no scheme: rejected in every state).
|
||||
return (
|
||||
'<meta http-equiv="refresh" content="2;URL=/big/f6/refreshed.html">'
|
||||
'<a href="/big/f6/based.html">based</a>'
|
||||
'<meta property="og:image" content="/big/x/og">'
|
||||
'<meta name="twitter:image" content="/big/x/tw">'
|
||||
'<script type="application/ld+json">'
|
||||
'{"@type": "Thing", "image": "/big/x/jsonld.png"}</script>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_legacy(port):
|
||||
# Comma-valued applet archive is rejected whole by the engine (decoy).
|
||||
return (
|
||||
'<a href="/big/f7/frames.html">frames</a>'
|
||||
'<img src="/big/a/map.gif" usemap="#m">'
|
||||
'<map name="m">'
|
||||
'<area shape="rect" coords="0,0,9,9" href="/big/f7/area.html"></map>'
|
||||
'<embed src="/big/a/e.pdf" type="application/pdf" width="9" height="9">'
|
||||
'<object data="/big/a/o.pdf" type="application/pdf"></object>'
|
||||
'<applet archive="/big/x/aj.jar,/big/x/bj.jar" width="1" height="1"></applet>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_svg(port):
|
||||
return (
|
||||
'<svg width="9" height="9">'
|
||||
'<image href="/big/a/svg-in.png" width="4" height="4"/>'
|
||||
'<use xlink:href="#icon"/></svg>'
|
||||
'<img src="/big/a/logo.svg">'
|
||||
)
|
||||
|
||||
|
||||
def _fam_i18n(port):
|
||||
return (
|
||||
'<a href="/big/f9/caf%C3%A9.html">cafe</a>'
|
||||
'<a href="/big/f9/latin1.html">latin1</a>'
|
||||
'<a href="/big/f9/metaonly.html">meta</a>'
|
||||
'<a href="/big/f9/bom.html">bom</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_http(port):
|
||||
return (
|
||||
'<a href="/big/r/hop1">chain</a>'
|
||||
'<a href="/big/r/get42">get42</a>'
|
||||
'<a href="/big/d/01">d01</a>'
|
||||
'<a href="/big/d/02">d02</a>'
|
||||
'<a href="/big/f10/empty.html">empty</a>'
|
||||
'<a href="/big/d/dl">dl</a>'
|
||||
)
|
||||
|
||||
|
||||
def _fam_forms(port):
|
||||
# GET form action is rewritten but never fetched; formaction/ping are
|
||||
# outside the attribute tables (decoys).
|
||||
return (
|
||||
'<form action="/big/x/form-target.html" method="get">'
|
||||
'<input type="text" name="q">'
|
||||
'<input type="image" src="/big/a/btn.png" alt="go"></form>'
|
||||
'<a href="/big/f11/page.html">bare</a>'
|
||||
'<a href="/big/f11/page.html?utm_source=news&utm_medium=mail">utm</a>'
|
||||
'<a href="/big/f11/sess.html?PHPSESSID=deadbeef123">sess</a>'
|
||||
'<button formaction="/big/x/formact">go</button>'
|
||||
'<a href="/big/f11/page.html" ping="/big/x/ping">ping</a>'
|
||||
)
|
||||
|
||||
|
||||
BIG_FAMILIES = [
|
||||
_fam_feeds,
|
||||
_fam_plain,
|
||||
_fam_srcset,
|
||||
_fam_media,
|
||||
_fam_css,
|
||||
_fam_js,
|
||||
_fam_meta,
|
||||
_fam_legacy,
|
||||
_fam_svg,
|
||||
_fam_i18n,
|
||||
_fam_http,
|
||||
_fam_forms,
|
||||
]
|
||||
|
||||
|
||||
def big_link(m, style):
|
||||
return ["%d.html" % m, "../p/%d.html" % m, "/big/p/%d.html" % m][style]
|
||||
|
||||
|
||||
def big_page(n, port):
|
||||
style = n % 3
|
||||
home = ["../index.html", "/big/index.html", "../index.html"][style]
|
||||
parts = ['<a href="%s">home</a>' % home]
|
||||
if n > 0:
|
||||
parts.append('<a href="%s">up</a>' % big_link((n - 1) // BIG_FANOUT, style))
|
||||
for c in range(n * BIG_FANOUT + 1, n * BIG_FANOUT + BIG_FANOUT + 1):
|
||||
if c < BIG_PAGES:
|
||||
parts.append('<a href="%s">p%d</a>' % (big_link(c, style), c))
|
||||
parts.append('<link rel="stylesheet" href="/big/a/site.css">')
|
||||
parts.append('<script src="/big/a/app.js"></script>')
|
||||
exts = ["png", "gif", "jpg"]
|
||||
ia = "/big/a/i%da.%s" % (n, exts[n % 3])
|
||||
ib = "/big/a/i%db.%s" % (n, exts[(n + 1) % 3])
|
||||
# Rotate the second-image construct across deterministic table attributes.
|
||||
con = n % 4
|
||||
if con == 0:
|
||||
parts.append('<img src="%s"><img src="%s">' % (ia, ib))
|
||||
elif con == 1:
|
||||
parts.append(
|
||||
'<img src="%s"><table background="%s"><tr><td>t</td></tr></table>'
|
||||
% (ia, ib)
|
||||
)
|
||||
elif con == 2:
|
||||
parts.append('<img src="%s"><img src="%s" data-src="%s">' % (ia, ia, ib))
|
||||
else:
|
||||
parts.append(
|
||||
'<img src="%s" loading="lazy"><video poster="%s"></video>' % (ia, ib)
|
||||
)
|
||||
parts.append(BIG_FAMILIES[n % 12](port))
|
||||
return big_html("p%d" % n, "\n".join(parts))
|
||||
|
||||
|
||||
def big_index(port):
|
||||
return big_html(
|
||||
"big index",
|
||||
'<link rel="stylesheet" href="/big/a/site.css">'
|
||||
'<script src="/big/a/app.js"></script>'
|
||||
'<a href="p/0.html">root</a>'
|
||||
'<img src="/big/a/d1/d2/d3/d4/d5/d6/d7/d8/deep.png">'
|
||||
'<a href="/big/f1/long.html?x=%s">long</a>'
|
||||
'<a href="/big/f1/gzok.html">gzok</a>'
|
||||
'<a href="//127.0.0.1:%d/big/f1/protorel.html">protorel</a>'
|
||||
'<a href="http://127.0.0.1:%d/big/f1/abshost.html">abshost</a>'
|
||||
'<a href="/big/e/404.html">e404</a>'
|
||||
'<a href="/big/e/410.html">e410</a>'
|
||||
'<a href="/big/e/500.html">e500</a>'
|
||||
'<a href="/big/e/gztrunc.html">gzt</a>'
|
||||
'<a href="?">query</a>' % ("a" * 900, port, port),
|
||||
)
|
||||
|
||||
|
||||
BIG_REDIRECTS = {
|
||||
"/big/r/hop1": (301, "/big/r/hop2"),
|
||||
"/big/r/hop2": (302, "/big/f10/land.html"),
|
||||
"/big/r/get42": (301, "/big/a/doc.pdf"),
|
||||
"/big/f1/dir": (301, "/big/f1/dir/"),
|
||||
}
|
||||
|
||||
BIG_SIMPLE_PAGES = {
|
||||
"/big/p/two.html": "dot-slash target",
|
||||
"/big/f1/one.html": "one",
|
||||
"/big/f1/tri.html": "tri",
|
||||
"/big/f1/abs.html": "abs",
|
||||
"/big/f1/dir/": "dir index",
|
||||
"/big/f1/long.html": "long",
|
||||
"/big/f1/gzok.html": "gzok",
|
||||
"/big/f1/protorel.html": "protorel",
|
||||
"/big/f1/abshost.html": "abshost",
|
||||
"/big/f5/dw.html": "dw target",
|
||||
"/big/f6/refreshed.html": "refreshed",
|
||||
"/big/f6/sub/leaf.html": "leaf",
|
||||
"/big/f7/fa.html": "frame a",
|
||||
"/big/f7/fb.html": "frame b",
|
||||
"/big/f7/fn.html": "noframes",
|
||||
"/big/f7/area.html": "area",
|
||||
"/big/f10/land.html": "landed",
|
||||
"/big/f11/page.html": "the page",
|
||||
"/big/f11/sess.html": "the sess page",
|
||||
}
|
||||
|
||||
# Extensionless downloads: name resolution is wire-type driven (#478 contract).
|
||||
BIG_DOWNLOADS = {
|
||||
"/big/d/01": ("pdf", None),
|
||||
"/big/d/02": ("png", None),
|
||||
"/big/d/dl": ("pdf", 'attachment; filename="named.pdf"'),
|
||||
}
|
||||
|
||||
|
||||
def _big_rss(port):
|
||||
# purl.org marker makes the feed parse; item URLs are already-linked pages.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">\n'
|
||||
"<channel><title>big</title><link>http://127.0.0.1:%d/big/index.html</link>\n"
|
||||
"<item><title>i1</title><link>http://127.0.0.1:%d/big/p/1.html</link>\n"
|
||||
'<enclosure url="http://127.0.0.1:%d/big/p/2.html" type="text/html"/></item>\n'
|
||||
"</channel></rss>\n" % (port, port, port)
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_atom(port):
|
||||
# No purl marker: emitted verbatim, its URL must never be fetched.
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<feed xmlns="http://www.w3.org/2005/Atom"><title>big</title>\n'
|
||||
"<entry><title>e1</title>"
|
||||
'<link href="http://127.0.0.1:%d/big/x/atom-only.html"/>'
|
||||
"</entry></feed>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
def _big_sitemap(port):
|
||||
return (
|
||||
'<?xml version="1.0"?>\n'
|
||||
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
|
||||
"<url><loc>http://127.0.0.1:%d/big/x/sitemap-only.html</loc></url>\n"
|
||||
"</urlset>\n" % port
|
||||
).encode()
|
||||
|
||||
|
||||
class Handler(SimpleHTTPRequestHandler):
|
||||
# Quieter logging; the launcher captures httrack's own log anyway.
|
||||
def log_message(self, fmt, *args):
|
||||
@@ -879,15 +468,11 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
# so only an engine-side abort can end the crawl.
|
||||
TRICKLE_SECONDS = 60
|
||||
|
||||
def send_bin_index(self):
|
||||
"""Index page linking p0.bin..p7.bin (shared by trickle and bigfiles)."""
|
||||
def route_trickle_index(self):
|
||||
self.send_html(
|
||||
"".join('\t<a href="p%d.bin">p%d</a>\n' % (i, i) for i in range(8))
|
||||
)
|
||||
|
||||
def route_trickle_index(self):
|
||||
self.send_bin_index()
|
||||
|
||||
def route_trickle_page(self):
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/octet-stream")
|
||||
@@ -903,15 +488,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# -M byte cap (#77): large fast files so a crawl overruns -M immediately.
|
||||
BIGFILE_BYTES = 640 * 1024
|
||||
|
||||
def route_bigfiles_index(self):
|
||||
self.send_bin_index()
|
||||
|
||||
def route_bigfile(self):
|
||||
self.send_raw(b"x" * self.BIGFILE_BYTES, "application/octet-stream")
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -966,15 +542,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/trickle/p5.bin": route_trickle_page,
|
||||
"/trickle/p6.bin": route_trickle_page,
|
||||
"/trickle/p7.bin": route_trickle_page,
|
||||
"/bigfiles/index.html": route_bigfiles_index,
|
||||
"/bigfiles/p0.bin": route_bigfile,
|
||||
"/bigfiles/p1.bin": route_bigfile,
|
||||
"/bigfiles/p2.bin": route_bigfile,
|
||||
"/bigfiles/p3.bin": route_bigfile,
|
||||
"/bigfiles/p4.bin": route_bigfile,
|
||||
"/bigfiles/p5.bin": route_bigfile,
|
||||
"/bigfiles/p6.bin": route_bigfile,
|
||||
"/bigfiles/p7.bin": route_bigfile,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
@@ -995,146 +562,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/redir/target.html": route_redir_target,
|
||||
}
|
||||
|
||||
# --- /big/ seeded pseudo-site ------------------------------------------
|
||||
|
||||
def big_send(self, body, ctype, code=200, extra=()):
|
||||
if code == 200 and self.headers.get("If-Modified-Since") == BIG_LASTMOD:
|
||||
self.send_response(304)
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
return
|
||||
self.send_response(code)
|
||||
if code == 200:
|
||||
self.send_header("Last-Modified", BIG_LASTMOD)
|
||||
self.send_header("Content-Type", ctype)
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
for name, value in extra:
|
||||
self.send_header(name, value)
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
def big_error(self, code, reason):
|
||||
body = big_html("error", "<p>%d</p>%s" % (code, HOME))
|
||||
self.big_send(body, "text/html", code=code, extra=[("X-Reason", reason)])
|
||||
|
||||
def route_big(self):
|
||||
split = urlsplit(self.path)
|
||||
path = unquote(split.path)
|
||||
port = self.server.server_address[1]
|
||||
if path in BIG_REDIRECTS:
|
||||
code, location = BIG_REDIRECTS[path]
|
||||
self.send_response(code)
|
||||
self.send_header("Location", location)
|
||||
self.send_header("Content-Length", "0")
|
||||
self.end_headers()
|
||||
elif path == "/big/index.html":
|
||||
self.big_send(big_index(port), "text/html")
|
||||
elif path in BIG_SIMPLE_PAGES:
|
||||
body = big_html(path, "<p>%s</p>%s" % (BIG_SIMPLE_PAGES[path], HOME))
|
||||
if path == "/big/f1/gzok.html":
|
||||
self.big_send(
|
||||
gzip.compress(body, mtime=0),
|
||||
"text/html",
|
||||
extra=[("Content-Encoding", "gzip")],
|
||||
)
|
||||
else:
|
||||
self.big_send(body, "text/html")
|
||||
elif path == "/big/f1/list.html":
|
||||
# Pagination: distinct content per query string.
|
||||
body = big_html("list", "<p>listing %s</p>%s" % (split.query or "1", HOME))
|
||||
self.big_send(body, "text/html")
|
||||
elif path == "/big/f6/based.html":
|
||||
self.big_send(
|
||||
big_html(
|
||||
"based",
|
||||
'<base href="http://127.0.0.1:%d/big/f6/sub/">'
|
||||
'<a href="leaf.html">leaf</a>' % port,
|
||||
),
|
||||
"text/html",
|
||||
)
|
||||
elif path == "/big/f7/frames.html":
|
||||
self.big_send(
|
||||
b'<html><frameset cols="50%,50%"><frame src="fa.html">'
|
||||
b'<frame src="fb.html"><noframes><body><a href="fn.html">fn</a>'
|
||||
b"</body></noframes></frameset></html>",
|
||||
"text/html",
|
||||
)
|
||||
elif path == "/big/f9/café.html":
|
||||
self.big_send(big_html("cafe", "<p>cafe</p>%s" % HOME), "text/html")
|
||||
elif path == "/big/f9/latin1.html":
|
||||
self.big_send(
|
||||
b"<html><body><p>caf\xe9 latin</p></body></html>",
|
||||
"text/html; charset=ISO-8859-1",
|
||||
)
|
||||
elif path == "/big/f9/metaonly.html":
|
||||
self.big_send(
|
||||
'<html><head><meta charset="utf-8"></head>'
|
||||
"<body><p>café meta</p></body></html>".encode(),
|
||||
"text/html",
|
||||
)
|
||||
elif path == "/big/f9/bom.html":
|
||||
self.big_send(
|
||||
b"\xef\xbb\xbf" + big_html("bom", "<p>bom</p>%s" % HOME), "text/html"
|
||||
)
|
||||
elif path == "/big/f10/empty.html":
|
||||
self.big_send(b"", "text/html")
|
||||
elif path == "/big/f12/rss.xml":
|
||||
self.big_send(_big_rss(port), "text/xml")
|
||||
elif path == "/big/f12/atom.xml":
|
||||
self.big_send(_big_atom(port), "application/xml")
|
||||
elif path == "/big/f12/sitemap.xml":
|
||||
self.big_send(_big_sitemap(port), "text/xml")
|
||||
elif path.startswith("/big/p/"):
|
||||
try:
|
||||
n = int(path[len("/big/p/") : -len(".html")])
|
||||
except ValueError:
|
||||
n = -1
|
||||
if 0 <= n < BIG_PAGES and path.endswith(".html"):
|
||||
self.big_send(big_page(n, port), "text/html")
|
||||
else:
|
||||
self.big_error(404, "no such page")
|
||||
elif path.startswith("/big/a/") or path.startswith("/big/x/"):
|
||||
name = path[len("/big/a/") :]
|
||||
if path.startswith("/big/a/") and name in BIG_TEXT_ASSETS:
|
||||
text, ctype = BIG_TEXT_ASSETS[name]
|
||||
self.big_send(text.encode(), ctype)
|
||||
elif name.endswith(".html"):
|
||||
# Decoy targets 200 so a parser leak becomes a mirror file.
|
||||
self.big_send(big_html(name, "<p>%s</p>" % name), "text/html")
|
||||
else:
|
||||
ext = name.rsplit(".", 1)[-1]
|
||||
ctype = BIG_CTYPES.get(ext, "application/octet-stream")
|
||||
self.big_send(big_asset(name), ctype)
|
||||
elif path in BIG_DOWNLOADS:
|
||||
ext, cdispo = BIG_DOWNLOADS[path]
|
||||
extra = [("Content-Disposition", cdispo)] if cdispo else []
|
||||
self.big_send(
|
||||
big_asset(path[len("/big/") :] + "." + ext),
|
||||
BIG_CTYPES[ext],
|
||||
extra=extra,
|
||||
)
|
||||
elif path == "/big/e/404.html":
|
||||
self.big_error(404, "Not Found")
|
||||
elif path == "/big/e/410.html":
|
||||
self.big_error(410, "Gone")
|
||||
elif path == "/big/e/500.html":
|
||||
self.big_error(500, "Server Error")
|
||||
elif path == "/big/e/gztrunc.html":
|
||||
# Half a gzip stream, honest Content-Length: decode fails, and the
|
||||
# missing Last-Modified keeps it the one uncacheable resource.
|
||||
full = gzip.compress(big_html("gz", "x" * 3000), mtime=0)
|
||||
body = full[: len(full) // 2]
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/html")
|
||||
self.send_header("Content-Encoding", "gzip")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
else:
|
||||
self.big_error(404, "no such big path")
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
def reject_fragment(self):
|
||||
@@ -1150,9 +577,6 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def dispatch(self):
|
||||
self._set_cookies = []
|
||||
path = urlsplit(self.path).path
|
||||
if path.startswith("/big/"):
|
||||
self.route_big()
|
||||
return True
|
||||
# Match percent-encoded paths (accented #157 route) by their decoded form.
|
||||
handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
|
||||
if handler is not None:
|
||||
|
||||
@@ -211,9 +211,7 @@ main() {
|
||||
# lintian ourselves below as the real gate.
|
||||
local -a debuild_opts=(--no-lintian)
|
||||
local -a build_opts=()
|
||||
# -d: a source build runs no debhelper, so don't require Build-Depends
|
||||
# locally (the buildds and the --sbuild gate enforce them).
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S -d)
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||
if [[ $unsigned -eq 1 ]]; then
|
||||
build_opts+=(-us -uc)
|
||||
else
|
||||
@@ -236,15 +234,12 @@ main() {
|
||||
|
||||
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||
# reject the Debian "unstable" distribution. Suppressed tags are stale-local-
|
||||
# lintian skew, not package defects: newer-standards-version, and
|
||||
# recommended-field (old lintian still wants the Priority field the sid
|
||||
# lintian in CI accepts dropping). set -e turns any error/warning tag into
|
||||
# a failure.
|
||||
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||
# means the local lintian is older than the buildds', not a package
|
||||
# defect, so suppress it. set -e turns any error/warning tag into a failure.
|
||||
info "running lintian gate (--fail-on=error,warning)"
|
||||
lintian --profile debian -I -i --fail-on=error,warning \
|
||||
--suppress-tags newer-standards-version,recommended-field \
|
||||
"${changes[@]}"
|
||||
--suppress-tags newer-standards-version "${changes[@]}"
|
||||
|
||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user