mirror of
https://github.com/xroche/httrack.git
synced 2026-06-21 01:28:35 +03:00
Compare commits
4 Commits
fix/267-de
...
fix/empty-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02180549f6 | ||
|
|
1611dbcabf | ||
|
|
099501ee50 | ||
|
|
1b9eefa3b4 |
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@@ -227,7 +227,8 @@ jobs:
|
||||
# Validate the Debian packaging via the same script maintainers release with.
|
||||
# One amd64/gcc run is enough: packaging (control/rules/manifest/lintian/quilt
|
||||
# source build) is arch- and compiler-independent, and the build matrix above
|
||||
# already covers compile portability. lintian runs with --fail-on=error.
|
||||
# already covers compile portability. mkdeb.sh runs lintian as an explicit gate
|
||||
# (debuild does not propagate lintian's exit) with --fail-on=error,warning.
|
||||
deb:
|
||||
name: deb package (lintian)
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -29,9 +29,9 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:0:0: htsblk layout changed (contenttype/charset/contentencoding widened to
|
||||
# 128), an incompatible ABI break, so bump current and reset revision/age.
|
||||
VERSION_INFO="3:0:0"
|
||||
# 4:0:0: htsblk gained the contenttype_given field, an incompatible ABI break,
|
||||
# so bump current and reset revision/age.
|
||||
VERSION_INFO="4:0:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
10
debian/changelog
vendored
10
debian/changelog
vendored
@@ -1,3 +1,13 @@
|
||||
httrack (3.49.8-3) unstable; urgency=medium
|
||||
|
||||
* Rename libhttrack3 to libhttrack4 to follow the SONAME bump to
|
||||
libhttrack.so.4: htsblk gained a contenttype_given field, an
|
||||
incompatible ABI change (VERSION_INFO 3 -> 4). The .files wildcard
|
||||
now tracks .so.4* so the runtime libraries land in the right
|
||||
package. New binary package, via NEW.
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sat, 20 Jun 2026 19:46:16 +0200
|
||||
|
||||
httrack (3.49.8-2) unstable; urgency=medium
|
||||
|
||||
* Rename libhttrack2 to libhttrack3 to follow the SONAME, which the 3.49.8
|
||||
|
||||
6
debian/control
vendored
6
debian/control
vendored
@@ -58,13 +58,13 @@ Description: webhttrack common files
|
||||
This package is the common files of webhttrack, website copier and
|
||||
mirroring utility
|
||||
|
||||
Package: libhttrack3
|
||||
Package: libhttrack4
|
||||
Architecture: any
|
||||
Multi-Arch: same
|
||||
Section: libs
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Replaces: libhttrack2, httrack (<< 3.49.8-2~)
|
||||
Breaks: libhttrack2, httrack (<< 3.49.8-2~)
|
||||
Replaces: libhttrack3, httrack (<< 3.49.8-3~)
|
||||
Breaks: libhttrack3, httrack (<< 3.49.8-3~)
|
||||
Description: Httrack website copier library
|
||||
This package is the library part of httrack, website copier and mirroring
|
||||
utility
|
||||
|
||||
3
debian/httrack-doc.lintian-overrides
vendored
3
debian/httrack-doc.lintian-overrides
vendored
@@ -4,3 +4,6 @@
|
||||
# so the path lives in the display pointer, not the override -- match with '*'.
|
||||
httrack-doc: extra-license-file *
|
||||
httrack-doc: package-contains-documentation-outside-usr-share-doc *
|
||||
# search.sh is a sample CGI shipped alongside the HTML manual, not meant to be
|
||||
# run from the package tree; it stays non-executable by design.
|
||||
httrack-doc: script-not-executable *
|
||||
|
||||
3
debian/libhttrack3.files
vendored
3
debian/libhttrack3.files
vendored
@@ -1,3 +0,0 @@
|
||||
usr/lib/*/libhttrack.so.3*
|
||||
usr/lib/*/libhtsjava.so.3*
|
||||
usr/share/httrack/templates
|
||||
3
debian/libhttrack4.files
vendored
Normal file
3
debian/libhttrack4.files
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
usr/lib/*/libhttrack.so.4*
|
||||
usr/lib/*/libhtsjava.so.4*
|
||||
usr/share/httrack/templates
|
||||
@@ -1,3 +1,3 @@
|
||||
# The shared libraries ship without a versioned symbols control file (ABI is
|
||||
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
||||
libhttrack3: no-symbols-control-file usr/lib/*
|
||||
libhttrack4: no-symbols-control-file usr/lib/*
|
||||
2
debian/rules
vendored
2
debian/rules
vendored
@@ -135,7 +135,7 @@ binary-arch: build install
|
||||
dh_makeshlibs -a -X/usr/lib/$(DEB_HOST_MULTIARCH)/httrack/libtest --version-info
|
||||
dh_installdeb -a
|
||||
# we depend on the current version (ABI may change)
|
||||
dh_shlibdeps -a -ldebian/libhttrack3/usr/lib/$(DEB_HOST_MULTIARCH)
|
||||
dh_shlibdeps -a -ldebian/libhttrack4/usr/lib/$(DEB_HOST_MULTIARCH)
|
||||
dh_gencontrol -a
|
||||
dh_md5sums -a
|
||||
dh_builddeb -a
|
||||
|
||||
18
src/htslib.c
18
src/htslib.c
@@ -1396,6 +1396,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
const char *a = rcvd;
|
||||
|
||||
retour->contenttype_given = HTS_FALSE; /* set when a Content-Type is seen */
|
||||
|
||||
// exemple:
|
||||
// HTTP/1.0 200 OK
|
||||
if (*a) {
|
||||
@@ -1589,11 +1591,17 @@ void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * ret
|
||||
}
|
||||
}
|
||||
}
|
||||
sscanf(rcvd + p, "%s", tempo);
|
||||
if (strlen(tempo) < sizeof(retour->contenttype) - 2) // pas trop long!!
|
||||
strcpybuff(retour->contenttype, tempo);
|
||||
else
|
||||
strcpybuff(retour->contenttype, "application/octet-stream-unknown"); // erreur
|
||||
// An empty/whitespace Content-Type value yields no token; keep the
|
||||
// default type and the "not given" flag instead of reading uninit tempo.
|
||||
if (sscanf(rcvd + p, "%s", tempo) == 1) {
|
||||
if (strlen(tempo) < sizeof(retour->contenttype) - 2) // pas trop long!!
|
||||
strcpybuff(retour->contenttype, tempo);
|
||||
else
|
||||
strcpybuff(retour->contenttype,
|
||||
"application/octet-stream-unknown"); // erreur
|
||||
retour->contenttype_given =
|
||||
HTS_TRUE; /* server declared a usable type */
|
||||
}
|
||||
}
|
||||
} else if ((p = strfield(rcvd, "Content-Range:")) != 0) {
|
||||
// Content-Range: bytes 0-70870/70871
|
||||
|
||||
@@ -139,12 +139,13 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
}
|
||||
|
||||
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||
saved file? True only when the type is patchable (may_unknown2) and doing so
|
||||
would not clobber a URL extension that already maps to a specific, non-HTML
|
||||
type. This is the #267 mangle guard: a .png served as text/html (or with no
|
||||
type) stays named .png. */
|
||||
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||
extension implies no specific type or the server declared a disagreeing one.
|
||||
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||
server sent NO Content-Type (the #267 mangle guard): a typeless .png stays
|
||||
.png, but a .pdf explicitly served as text/html is named .html. */
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
const char *file, int contenttype_given) {
|
||||
char urlmime[256];
|
||||
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
@@ -155,9 +156,12 @@ static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
return 1; /* URL ext implies no known type: trust the wire type */
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||
/* wire disagrees: keep a specific non-HTML ext against an html/empty claim */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
(is_html_mime_type(wiremime) || !strnotempty(wiremime)))
|
||||
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
|
||||
the server sent NO Content-Type: a missing type is defaulted to text/html
|
||||
upstream and must not clobber e.g. a .png. An explicitly declared type is
|
||||
trusted, so a binary-looking URL that really serves HTML (login/error
|
||||
interstitial, soft-404) is named .html instead of kept as .pdf/.jpg. */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) && !contenttype_given)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
@@ -407,7 +411,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil,
|
||||
r.contenttype_given)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
@@ -453,7 +458,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
headers->url_fil,
|
||||
headers->r.contenttype_given)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
@@ -682,7 +688,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
back[b].url_fil,
|
||||
back[b].r.contenttype_given)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
|
||||
@@ -651,6 +651,8 @@ struct htsblk {
|
||||
int debugid; /**< connection debug id */
|
||||
/* */
|
||||
htsrequest req; /**< parameters used for the request */
|
||||
/* a non-empty Content-Type value was received (else contenttype holds a default) */
|
||||
hts_boolean contenttype_given;
|
||||
/*char digest[32+2]; // md5 digest generated by the engine ("" if none) */
|
||||
};
|
||||
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Content-Type vs URL-extension naming (issue #267 family). Under the default
|
||||
# delayed type check (-%N2), a bogus/missing html-ish wire type must not clobber
|
||||
# a URL extension that maps to a specific non-HTML type. The .html "mangle" names
|
||||
# are asserted absent so a regression that re-introduces it fails here.
|
||||
# Content-Type vs URL-extension naming (issue #267 family) under the default
|
||||
# delayed type check (-%N2). Policy: a MISSING Content-Type must not clobber a
|
||||
# URL extension that maps to a specific non-HTML type (.png/.pdf stay as-is);
|
||||
# an explicitly DECLARED type is trusted, so a binary-looking URL that really
|
||||
# serves HTML (text/html on .pdf/.jpg) is named .html. The "wrong" names are
|
||||
# asserted absent so a regression in either direction fails here.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.png' --not-found 'types/lie.html' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/notype.png' \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
--found 'types/style.css' \
|
||||
--found 'types/data.json' \
|
||||
|
||||
12
tests/17_local-empty-ct.test
Normal file
12
tests/17_local-empty-ct.test
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# An empty "Content-Type:" header value must be treated as "no usable type"
|
||||
# (keep the URL extension), not parsed from an uninitialized buffer. The crawl
|
||||
# also runs under ASan/UBSan in CI, which catches the uninitialized read this
|
||||
# guards against.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/emptyct.png' --not-found 'types/emptyct.html' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
@@ -53,6 +53,7 @@ TESTS = \
|
||||
13_local-cookies.test \
|
||||
14_local-https.test \
|
||||
15_local-types.test \
|
||||
16_local-assume.test
|
||||
16_local-assume.test \
|
||||
17_local-empty-ct.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -131,15 +131,21 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# A fake-binary PNG-ish blob for the image/typeless cases.
|
||||
# Fake-binary blobs for the image/pdf/typeless cases.
|
||||
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
|
||||
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
|
||||
|
||||
# path -> (body, content_type); content_type None means no header at all.
|
||||
# path -> (body, content_type); None sends no header, "" sends an empty
|
||||
# Content-Type value (no usable type, must be treated like None).
|
||||
TYPE_MATRIX = {
|
||||
"/types/control.php": (b"<html><body>control</body></html>", "text/html"),
|
||||
"/types/photo.png": (FAKE_PNG, "image/png"),
|
||||
"/types/doc.pdf": (FAKE_PDF, "application/pdf"),
|
||||
"/types/notype.png": (FAKE_PNG, None),
|
||||
"/types/notype.pdf": (FAKE_PDF, None),
|
||||
"/types/emptyct.png": (FAKE_PNG, ""),
|
||||
"/types/lie.png": (FAKE_PNG, "text/html"),
|
||||
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
|
||||
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
|
||||
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
|
||||
"/types/style.css": (b"body { color: red; }\n", "text/css"),
|
||||
@@ -151,8 +157,12 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body = (
|
||||
'\t<a href="control.php">control</a>\n'
|
||||
'\t<img src="photo.png" />\n'
|
||||
'\t<a href="doc.pdf">doc</a>\n'
|
||||
'\t<img src="notype.png" />\n'
|
||||
'\t<a href="notype.pdf">notypepdf</a>\n'
|
||||
'\t<img src="emptyct.png" />\n'
|
||||
'\t<img src="lie.png" />\n'
|
||||
'\t<a href="report.pdf">report</a>\n'
|
||||
'\t<a href="page.htm">htm</a>\n'
|
||||
'\t<script src="script.js"></script>\n'
|
||||
'\t<link rel="stylesheet" href="style.css" />\n'
|
||||
@@ -174,8 +184,12 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/index.html": route_types_index,
|
||||
"/types/control.php": route_types,
|
||||
"/types/photo.png": route_types,
|
||||
"/types/doc.pdf": route_types,
|
||||
"/types/notype.png": route_types,
|
||||
"/types/notype.pdf": route_types,
|
||||
"/types/emptyct.png": route_types,
|
||||
"/types/lie.png": route_types,
|
||||
"/types/report.pdf": route_types,
|
||||
"/types/page.htm": route_types,
|
||||
"/types/script.js": route_types,
|
||||
"/types/style.css": route_types,
|
||||
|
||||
@@ -206,9 +206,10 @@ main() {
|
||||
cp -a "$export_dir/debian" "httrack-$ver/debian"
|
||||
)
|
||||
|
||||
# Build (debuild also runs lintian and signs). --fail-on aborts on a lintian
|
||||
# error or warning, so neither a release nor CI produces an unclean package.
|
||||
local -a debuild_opts=(--lintian-opts -I -i "--fail-on=error,warning")
|
||||
# Build and sign. debuild runs lintian too but does NOT propagate its exit
|
||||
# status, so a broken package would pass unnoticed; disable it here and run
|
||||
# lintian ourselves below as the real gate.
|
||||
local -a debuild_opts=(--no-lintian)
|
||||
local -a build_opts=()
|
||||
[[ $source_only -eq 1 ]] && build_opts+=(-S)
|
||||
if [[ $unsigned -eq 1 ]]; then
|
||||
@@ -219,7 +220,8 @@ main() {
|
||||
info "building packages with debuild"
|
||||
(
|
||||
cd "$scratch/httrack-$ver"
|
||||
debuild "${build_opts[@]}" "${debuild_opts[@]}"
|
||||
# debuild options (--no-lintian) must precede the dpkg-buildpackage ones
|
||||
debuild "${debuild_opts[@]}" "${build_opts[@]}"
|
||||
)
|
||||
|
||||
# Collect every file the .changes references (orig, dsc, debs, ddebs, buildinfo).
|
||||
@@ -229,6 +231,16 @@ main() {
|
||||
changes=("$scratch"/*.changes)
|
||||
shopt -u nullglob
|
||||
[[ ${#changes[@]} -ge 1 ]] || die "debuild produced no .changes file"
|
||||
|
||||
# The real lintian gate (debuild only reports, it does not fail on tags).
|
||||
# --profile debian: CI runners are Ubuntu, whose vendor data would wrongly
|
||||
# reject the Debian "unstable" distribution. newer-standards-version only
|
||||
# means the local lintian is older than the buildds', not a package
|
||||
# defect, so suppress it. --fail-on makes lintian exit non-zero on any error/warning tag, and set -e then aborts the script.
|
||||
info "running lintian gate (--fail-on=error,warning)"
|
||||
lintian --profile debian -I -i --fail-on=error,warning \
|
||||
--suppress-tags newer-standards-version "${changes[@]}"
|
||||
|
||||
dcmd cp -- "${changes[@]}" "$outdir/"
|
||||
|
||||
# Clean-room build gate: rebuild the source package in a minimal chroot that
|
||||
|
||||
Reference in New Issue
Block a user