Compare commits

...

4 Commits

Author SHA1 Message Date
Xavier Roche
9c8d3a41eb tests: tighten the type-matrix guards
Add two assertions surfaced by review of the override path: control.php
must not survive its rename to control.html (a dual-write regression
would leave both), and gen.php?id=5 (a query/extension-less URL served
image/png) must keep its .png and not be mangled to .html. Both exercise
the "override still fires" direction that the suppression cases don't.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 18:25:45 +02:00
Xavier Roche
ae77cd9d6d Honor --assume under the default delayed type check (-%N2)
Under HARD savename-delayed (the default), url_savename() forced
is_html=-1 before consulting the user's --assume rules, so a type the
user pinned was lost to the delayed name and never applied (#56). Skip
the forced delay when is_userknowntype() matches: ishtml() already
consults the user type, so the immediate naming path applies it. Files
with no --assume rule are unaffected -- is_userknowntype() is false and
the delay still fires.

tests/16_local-assume.test crawls a .png served as image/png but assumed
text/html and checks it is saved .html; it fails without this change.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 18:12:01 +02:00
Xavier Roche
51b8dcd81c Keep a known URL extension against a bogus html/empty Content-Type
Under the default delayed type check (-%N2), url_savename() rewrote a
saved file's extension from the wire Content-Type, gated only by
!may_unknown2(). text/html is not in the keep-list, so a response
labeled text/html -- or a typeless one, which is coerced to text/html --
clobbered the URL's own extension: a PNG served as text/html or with no
Content-Type was saved as .html, and .htm was normalized to .html (#29).
The bytes stayed intact; only the name was silently wrong.

wire_patches_ext() now lets the wire type override the extension only
when the type is patchable and doing so would not clobber a URL
extension that already maps to a specific, non-HTML type. A generator or
extension-less URL still becomes .html; a .png stays .png.

tests/15_local-types.test locks this with a deterministic offline crawl
of a content-type/extension matrix (tests/local-server.py); it fails on
the unfixed engine. Addresses the #267 mangle family (incl. #29).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-20 18:07:08 +02:00
Xavier Roche
bcce664143 Merge pull request #364 from xroche/feature/local-test-server
tests: offline local test server prototype (cookies + HTTPS)
2026-06-20 16:41:26 +02:00
5 changed files with 125 additions and 5 deletions

View File

@@ -138,6 +138,30 @@ static void cleanEndingSpaceOrDot(char *s) {
}
}
/* Decides whether the Content-Type received on the wire may replace the
   extension carried by the URL when the saved file is named. The wire type
   wins only if it is patchable (may_unknown2() is false) and applying it would
   not overwrite a URL extension that already maps to a specific, non-HTML
   type. This is the #267 mangle guard: a .png served as text/html, or served
   with no type at all, keeps its .png name.
   Returns 1 when the wire type should drive the extension, 0 to keep the
   URL's own extension. */
static int wire_patches_ext(httrackp *opt, const char *wiremime,
                            const char *file) {
  char ext_mime[256];

  /* Keep-list / bogus-multiple types are kept verbatim, never patched. */
  if (may_unknown2(opt, wiremime, file)) {
    return 0;
  }

  /* Type implied by the URL extension, only when confidently known (flag 0). */
  ext_mime[0] = '\0';
  if (!get_httptype_sized(opt, ext_mime, sizeof(ext_mime), file, 0)) {
    return 1; /* the extension implies no known type: trust the wire type */
  }

  /* Wire and extension agree: nothing to patch (no .htm -> .html churn). */
  if (strfield2(wiremime, ext_mime)) {
    return 0;
  }

  /* They disagree. An extension that is itself hypertext may be patched... */
  if (is_hypertext_mime(opt, ext_mime, file)) {
    return 1;
  }

  /* ...but a specific non-HTML extension survives an html or empty claim. */
  if (is_html_mime_type(wiremime)) {
    return 0;
  }
  return strnotempty(wiremime) ? 1 : 0;
}
// builds the name of the file to save (save) from fil and adr
// smart scheme that renames when needed (example: both INDEX.HTML and index.html)
int url_savename(lien_adrfilsave *const afs,
@@ -325,7 +349,10 @@ int url_savename(lien_adrfilsave *const afs,
}
/* replace shtml to html.. */
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD)
/* HARD delays every type, except one the user pinned with --assume: honor it
immediately (ishtml() consults the user type), no delayed name (#56) */
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
!is_userknowntype(opt, fil))
is_html = -1; /* ALWAYS delay type */
else
is_html = ishtml(opt, fil);
@@ -380,7 +407,7 @@ int url_savename(lien_adrfilsave *const afs,
if (strnotempty(r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, r.cdispo);
} else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
if (give_mimext(s, sizeof(s),
r.contenttype)) { // recognized extension
ext_chg = 1;
@@ -425,7 +452,8 @@ int url_savename(lien_adrfilsave *const afs,
if (strnotempty(headers->r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, headers->r.cdispo);
} else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
} else if (wire_patches_ext(opt, headers->r.contenttype,
headers->url_fil)) {
char s[16];
if (give_mimext(
s, sizeof(s),
@@ -653,7 +681,8 @@ int url_savename(lien_adrfilsave *const afs,
if (strnotempty(back[b].r.cdispo)) { /* filename given */
ext_chg = 2; /* change filename */
strcpybuff(ext, back[b].r.cdispo);
} else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
} else if (wire_patches_ext(opt, back[b].r.contenttype,
back[b].url_fil)) {
if (give_mimext(
s, sizeof(s),
back[b].r.contenttype)) { // recognized extension

20
tests/15_local-types.test Normal file
View File

@@ -0,0 +1,20 @@
#!/bin/bash
#
# Content-Type vs URL-extension naming (issue #267 family). Under the default
# delayed type check (-%N2), a bogus/missing html-ish wire type must not clobber
# a URL extension that maps to a specific non-HTML type. The .html "mangle" names
# are asserted absent so a regression that re-introduces it fails here.
#
# Two assertions cover the opposite direction, where the wire type must still
# override the name: control.php is expected as control.html only (no leftover
# control.php), and the gen.php?id=5 resource is expected as gend61c.png rather
# than gend61c.html.

# Default top_srcdir to the parent directory when it is unset or empty.
: "${top_srcdir:=..}"
# Run the crawl through the shared local-crawl.sh driver, starting from the
# type-matrix index page; --errors 0 is passed alongside the found/not-found
# expectations.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'types/notype.png' --not-found 'types/notype.html' \
--found 'types/lie.png' --not-found 'types/lie.html' \
--found 'types/page.htm' --not-found 'types/page.html' \
--found 'types/photo.png' \
--found 'types/script.js' \
--found 'types/style.css' \
--found 'types/data.json' \
--found 'types/control.html' --not-found 'types/control.php' \
--found 'types/gend61c.png' --not-found 'types/gend61c.html' \
httrack 'BASEURL/types/index.html'

View File

@@ -0,0 +1,11 @@
#!/bin/bash
#
# --assume under the default delayed type check (-%N2), issue #56. A user type
# pinned with --assume must be honored immediately, not lost to the delayed
# name: photo.png served as image/png but assumed text/html is saved as .html.

# Default top_srcdir to the parent directory when it is unset or empty.
: "${top_srcdir:=..}"
# Crawl photo.png directly with "--assume png=text/html"; the file must be
# saved as photo.html and no photo.png may be left behind.
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'types/photo.html' --not-found 'types/photo.png' \
httrack 'BASEURL/types/photo.png' --assume png=text/html

View File

@@ -51,6 +51,8 @@ TESTS = \
12_crawl_https.test \
13_crawl_proxy_https.test \
13_local-cookies.test \
14_local-https.test
14_local-https.test \
15_local-types.test \
16_local-assume.test
CLEANFILES = check-network_sh.cache

View File

@@ -118,11 +118,69 @@ class Handler(SimpleHTTPRequestHandler):
if self.command != "HEAD":
self.wfile.write(body)
# --- type/extension matrix (issue #267 family) -------------------------
def send_raw(self, body, content_type):
"""Send a raw body with an explicit Content-Type, or none at all when
content_type is None (to observe httrack's typeless-file naming).

body is written to the response as-is and its len() is sent as
Content-Length; content_type is the header value, or None to omit the
Content-Type header."""
# The status is always 200.
self.send_response(200)
# Omit the Content-Type header entirely when the caller passes None.
if content_type is not None:
self.send_header("Content-Type", content_type)
self.send_header("Content-Length", str(len(body)))
self.end_headers()
# HEAD requests get the headers only; any other method also gets the body.
if self.command != "HEAD":
self.wfile.write(body)
# A fake-binary PNG-ish blob for the image/typeless cases: the 8-byte PNG
# file signature followed by 64 zero bytes.
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
# path -> (body, content_type); content_type None means no header at all.
TYPE_MATRIX = {
# HTML body served as text/html from a .php URL.
"/types/control.php": (b"<html><body>control</body></html>", "text/html"),
# PNG blob with a matching image/png type.
"/types/photo.png": (FAKE_PNG, "image/png"),
# PNG blob sent without any Content-Type header.
"/types/notype.png": (FAKE_PNG, None),
# PNG blob mislabeled as text/html.
"/types/lie.png": (FAKE_PNG, "text/html"),
# HTML body served as text/html from a .htm URL.
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
# Non-HTML text resources whose wire type matches their extension.
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
"/types/style.css": (b"body { color: red; }\n", "text/css"),
"/types/data.json": (b'{"k": "v"}\n', "application/json"),
# PNG blob served as image/png from a .php URL.
"/types/gen.php": (FAKE_PNG, "image/png"),
}
def route_types_index(self):
# Entry page of the type matrix: an HTML fragment that references each
# TYPE_MATRIX path once, as a link, image, script or stylesheet.
# gen.php is referenced with a query string (?id=5).
body = (
'\t<a href="control.php">control</a>\n'
'\t<img src="photo.png" />\n'
'\t<img src="notype.png" />\n'
'\t<img src="lie.png" />\n'
'\t<a href="page.htm">htm</a>\n'
'\t<script src="script.js"></script>\n'
'\t<link rel="stylesheet" href="style.css" />\n'
'\t<a href="data.json">json</a>\n'
'\t<img src="gen.php?id=5" />\n'
)
# NOTE(review): send_html is defined outside this view; presumably it wraps
# the fragment in a page and sends it as text/html -- confirm.
self.send_html(body)
def route_types(self):
# Shared handler for every TYPE_MATRIX path: look the request path up and
# send its (body, content_type) pair through send_raw.
# Only the .path component of self.path is used as the key, so
# "/types/gen.php?id=5" resolves to the "/types/gen.php" entry (assuming
# urlsplit is urllib.parse.urlsplit -- the import is not visible here).
# A path missing from TYPE_MATRIX would raise KeyError.
path = urlsplit(self.path).path
body, ctype = self.TYPE_MATRIX[path]
self.send_raw(body, ctype)
# request path -> handler function.
# NOTE(review): presumably consulted by the dispatch code that follows, which
# is outside this view -- confirm how the key is derived from the request.
ROUTES = {
"/cookies/entrance.php": route_entrance,
"/cookies/second.php": route_second,
"/cookies/third.php": route_third,
"/robots.txt": route_robots,
# Type/extension matrix: the index page has its own handler; every
# TYPE_MATRIX path shares route_types.
"/types/index.html": route_types_index,
"/types/control.php": route_types,
"/types/photo.png": route_types,
"/types/notype.png": route_types,
"/types/lie.png": route_types,
"/types/page.htm": route_types,
"/types/script.js": route_types,
"/types/style.css": route_types,
"/types/data.json": route_types,
"/types/gen.php": route_types,
}
# --- dispatch ----------------------------------------------------------