mirror of
https://github.com/xroche/httrack.git
synced 2026-06-21 09:38:24 +03:00
Compare commits
4 Commits
feature/lo
...
fix/267-de
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c8d3a41eb | ||
|
|
ae77cd9d6d | ||
|
|
51b8dcd81c | ||
|
|
bcce664143 |
@@ -138,6 +138,30 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||
saved file? True only when the type is patchable (may_unknown2) and doing so
|
||||
would not clobber a URL extension that already maps to a specific, non-HTML
|
||||
type. This is the #267 mangle guard: a .png served as text/html (or with no
|
||||
type) stays named .png. */
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||
urlmime[0] = '\0';
|
||||
/* type implied by the URL extension, only when confidently known (flag 0) */
|
||||
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
|
||||
return 1; /* URL ext implies no known type: trust the wire type */
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||
/* wire disagrees: keep a specific non-HTML ext against an html/empty claim */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
(is_html_mime_type(wiremime) || !strnotempty(wiremime)))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
||||
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
||||
int url_savename(lien_adrfilsave *const afs,
|
||||
@@ -325,7 +349,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
|
||||
/* replace shtml to html.. */
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD)
|
||||
/* HARD delays every type, except one the user pinned with --assume: honor it
|
||||
immediately (ishtml() consults the user type), no delayed name (#56) */
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||
!is_userknowntype(opt, fil))
|
||||
is_html = -1; /* ALWAYS delay type */
|
||||
else
|
||||
is_html = ishtml(opt, fil);
|
||||
@@ -380,7 +407,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
@@ -425,7 +452,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
@@ -653,7 +681,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
|
||||
20
tests/15_local-types.test
Normal file
20
tests/15_local-types.test
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#
# Saved-file naming when the served Content-Type and the URL extension disagree
# (issue #267 family). With the default delayed type check (-%N2), a bogus or
# missing html-ish wire type must not overwrite a URL extension that maps to a
# specific non-HTML type. The mangled ".html" names are asserted absent, so a
# regression that brings the mangling back fails this test.

: "${top_srcdir:=..}"

# Names expected (--found) and names that must not appear (--not-found).
expectations=(
  --found 'types/notype.png' --not-found 'types/notype.html'
  --found 'types/lie.png' --not-found 'types/lie.html'
  --found 'types/page.htm' --not-found 'types/page.html'
  --found 'types/photo.png'
  --found 'types/script.js'
  --found 'types/style.css'
  --found 'types/data.json'
  --found 'types/control.html' --not-found 'types/control.php'
  --found 'types/gend61c.png' --not-found 'types/gend61c.html'
)

bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
  "${expectations[@]}" \
  httrack 'BASEURL/types/index.html'
|
||||
11
tests/16_local-assume.test
Normal file
11
tests/16_local-assume.test
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
#
# Issue #56: --assume combined with the default delayed type check (-%N2).
# A type the user pinned with --assume has to be applied immediately instead of
# being lost to the delayed name: photo.png is served as image/png, but with
# png assumed to be text/html it must be saved as photo.html.

: "${top_srcdir:=..}"

# Checks handed to the crawl helper ahead of the httrack command line.
crawl_checks=(
  --errors 0
  --found 'types/photo.html' --not-found 'types/photo.png'
)

bash "$top_srcdir/tests/local-crawl.sh" "${crawl_checks[@]}" \
  httrack 'BASEURL/types/photo.png' --assume png=text/html
|
||||
@@ -51,6 +51,8 @@ TESTS = \
|
||||
12_crawl_https.test \
|
||||
13_crawl_proxy_https.test \
|
||||
13_local-cookies.test \
|
||||
14_local-https.test
|
||||
14_local-https.test \
|
||||
15_local-types.test \
|
||||
16_local-assume.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -118,11 +118,69 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# --- type/extension matrix (issue #267 family) -------------------------
|
||||
|
||||
def send_raw(self, body, content_type):
    """Answer 200 with `body` sent verbatim.

    A Content-Type header is written only when `content_type` is not None;
    passing None yields a response without that header, so the tests can
    observe how httrack names a file that arrives with no declared type.
    Content-Length is always written. Nothing is written after the headers
    for a HEAD request.
    """
    self.send_response(200)
    has_type = content_type is not None
    if has_type:
        self.send_header("Content-Type", content_type)
    self.send_header("Content-Length", str(len(body)))
    self.end_headers()

    # HEAD gets the headers only
    if self.command == "HEAD":
        return
    self.wfile.write(body)
|
||||
|
||||
# PNG-looking binary blob (8-byte PNG-style header, then 64 zero bytes) used
# as the body of the image and typeless cases.
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + bytes(64)

# Request path -> (body, content_type) served for it. A content_type of None
# means the response is sent without any Content-Type header.
TYPE_MATRIX = {
    # HTML served from a .php URL
    "/types/control.php": (
        b"<html><body>control</body></html>",
        "text/html",
    ),
    # extension and wire type agree
    "/types/photo.png": (FAKE_PNG, "image/png"),
    # .png with no Content-Type header at all
    "/types/notype.png": (FAKE_PNG, None),
    # .png declared as text/html
    "/types/lie.png": (FAKE_PNG, "text/html"),
    # .htm page declared as text/html
    "/types/page.htm": (
        b"<html><body>htm page</body></html>",
        "text/html",
    ),
    "/types/script.js": (b"var x = 1;\n", "application/javascript"),
    "/types/style.css": (b"body { color: red; }\n", "text/css"),
    "/types/data.json": (b'{"k": "v"}\n', "application/json"),
    # PNG body served from a .php URL
    "/types/gen.php": (FAKE_PNG, "image/png"),
}
|
||||
|
||||
def route_types_index(self):
    """Serve the index of the type/extension matrix.

    The markup links to every /types/ resource (relative URLs) so the crawler
    discovers them; it is handed to send_html() for delivery.
    """
    fragments = (
        '\t<a href="control.php">control</a>\n',
        '\t<img src="photo.png" />\n',
        '\t<img src="notype.png" />\n',
        '\t<img src="lie.png" />\n',
        '\t<a href="page.htm">htm</a>\n',
        '\t<script src="script.js"></script>\n',
        '\t<link rel="stylesheet" href="style.css" />\n',
        '\t<a href="data.json">json</a>\n',
        '\t<img src="gen.php?id=5" />\n',
    )
    self.send_html("".join(fragments))
|
||||
|
||||
def route_types(self):
    """Serve one entry of TYPE_MATRIX.

    The lookup key is the path component of the request URL, so a query
    string (as in "gen.php?id=5") does not take part in the lookup. The
    (body, content_type) pair found is passed on to send_raw().
    """
    requested = urlsplit(self.path)
    payload, mime = self.TYPE_MATRIX[requested.path]
    self.send_raw(payload, mime)
|
||||
|
||||
# Path -> handler table. The cookie, robots and types-index routes are listed
# by hand; every path present in TYPE_MATRIX is served by route_types, added
# in TYPE_MATRIX's own order.
ROUTES = {
    "/cookies/entrance.php": route_entrance,
    "/cookies/second.php": route_second,
    "/cookies/third.php": route_third,
    "/robots.txt": route_robots,
    "/types/index.html": route_types_index,
}
ROUTES.update(dict.fromkeys(TYPE_MATRIX, route_types))
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user