Keep unrecognized URL tails instead of mangling them to .html

url_savename truncated any trailing ".token" when applying a resolved
content-type, so /article-1.884291 served as text/html was saved as
article-1.html, dropping the .884291 tail and colliding with every
sibling sharing the prefix. Cut the old extension only when it is the
resolved type, a known MIME extension, a dynamic-page extension, or an
html-family extension; otherwise keep the tail and append the type
(article-1.884291.html).

Recognized extensions still collapse as before, so the #267/#408
soft-404 behavior (a binary URL served as HTML is still named .html)
is preserved, and a type that agrees with the extension causes no churn.

Add a hidden -#N <fil> <content-type> self-test driving url_savename
offline, plus tests/01_engine-savename.test covering the matrix.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
This commit is contained in:
Xavier Roche
2026-06-24 18:09:14 +02:00
parent 594cf0da39
commit 6da794fdb6
4 changed files with 88 additions and 7 deletions

View File

@@ -2468,6 +2468,44 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
htsmain_free();
return err;
} break;
case 'N': {
  // Hidden self-test: httrack -#N <fil> <content-type>
  // Calls url_savename for <fil> on the fixed host www.example.com, with a
  // pre-filled header block carrying HTTP_OK and <content-type>, then prints
  // the resulting local name as "savename: <path>".
  // NOTE(review): passing the headers is presumably what keeps the lookup
  // offline -- confirm against url_savename.
  lien_adrfilsave afs;
  cache_back cache;
  struct_back *sback;
  hash_struct hash;
  lien_back headers;

  // Both operands must follow the option on the command line.
  if (na + 2 >= argc) {
    fprintf(stderr,
            "Option #N requires <fil> <content-type> arguments\n");
    htsmain_free();
    return 1;
  }

  // Address/file pair to resolve.
  memset(&afs, 0, sizeof(afs));
  strcpybuff(afs.af.adr, "www.example.com");
  strcpybuff(afs.af.fil, argv[na + 1]);

  // Empty cache, backing stack and hash, built the same way for every run.
  memset(&cache, 0, sizeof(cache));
  cache.hashtable = (void *) coucal_new(0);
  sback = back_new(opt, opt->maxsoc * 32 + 1024);
  hash_init(opt, &hash, opt->urlhack);

  // Simulated response: HTTP_OK with the content-type given on the
  // command line.
  memset(&headers, 0, sizeof(headers));
  headers.status = 0;
  headers.r.statuscode = HTTP_OK;
  strcpybuff(headers.r.contenttype, argv[na + 2]);
  strcpybuff(headers.url_fil, argv[na + 1]);

  // Do not report success (or print an empty name) when resolution failed.
  // NOTE(review): assumes url_savename returns 0 on success -- confirm.
  if (url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, &hash, 0, 0,
                   &headers) != 0) {
    fprintf(stderr, "Option #N: url_savename failed for %s\n",
            argv[na + 1]);
    htsmain_free();
    return 1;
  }
  printf("savename: %s\n", afs.save);
  htsmain_free();
  return 0;
} break;
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
{
int hasFilter = 0;

View File

@@ -760,9 +760,9 @@ int url_savename(lien_adrfilsave *const afs,
strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
}
}
// Changer extension?
// par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
if (ext_chg && !opt->no_type_change) { // changer ext
// Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm
// depending on the resolved type.
if (ext_chg && !opt->no_type_change) {
char *a = fil + strlen(fil) - 1;
if ((opt->debug > 1) && (opt->log != NULL)) {
@@ -774,11 +774,18 @@ int url_savename(lien_adrfilsave *const afs,
adr_complete, fil_complete, ext);
}
if (ext_chg == 1) {
// Cut the old extension only when it is the new one or a recognized one;
// an unknown trailing ".token" (e.g. /article-1.884291, #115) is part of
// the name, not an extension to replace.
const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil);
const int known_ext = strfield2(old_ext, ext) || is_knowntype(opt, fil) ||
is_dyntype(old_ext) || ishtml_ext(old_ext) != -1;
while((a > fil) && (*a != '.') && (*a != '/'))
a--;
if (*a == '.')
*a = '\0'; // couper
strcatbuff(fil, "."); // recopier point
if (*a == '.' && known_ext)
*a = '\0'; // cut
strcatbuff(fil, "."); // re-add the dot
} else {
while((a > fil) && (*a != '/'))
a--;
@@ -786,7 +793,7 @@ int url_savename(lien_adrfilsave *const afs,
a++;
*a = '\0';
}
strcatbuff(fil, ext); // copier ext/nom
strcatbuff(fil, ext); // append ext/name
}
// Rechercher premier / et dernier .
{

35
tests/01_engine-savename.test Executable file
View File

@@ -0,0 +1,35 @@
#!/bin/bash
#
set -euo pipefail
# Local save-name extension resolution (url_savename via -#N <fil> <content-type>).
# Asserts on the basename of "savename: <path>".
# name <fil> <content-type> <expected-basename>
# Runs the hidden "httrack -#N" self-test and checks that the basename of the
# printed "savename: <path>" equals <expected-basename>. On any failure a FAIL
# line naming the case is written to stderr and the script exits with status 1.
name() {
    local raw out
    # Assigned separately from "local" so the exit status of httrack is not
    # masked; tested in an "if" so errexit cannot abort without a diagnostic.
    if ! raw="$(httrack -O /dev/null -#N "$1" "$2")"; then
        echo "FAIL: '$1' '$2' -> httrack -#N failed" >&2
        exit 1
    fi
    # Keep only the payload of the "savename: " line.
    out="$(sed -n 's/^savename: //p' <<<"$raw")"
    # Compare basenames only: the directory part depends on the output path.
    if [[ "${out##*/}" != "$3" ]]; then
        echo "FAIL: '$1' '$2' -> '$out' (want '$3')" >&2
        exit 1
    fi
}
# Case table: flat list of <fil> <content-type> <expected basename> triples,
# checked in order by the loop below.
cases=(
    # #115: an unknown trailing ".token" is part of the name; it is kept and
    # the resolved type is appended.
    '/article-1.884291' 'text/html' 'article-1.884291.html'
    '/news/story-12345.987654' 'text/html' 'story-12345.987654.html'
    # Recognized extensions still collapse to the resolved type.
    '/page.php' 'text/html' 'page.html'
    '/page.asp' 'text/html' 'page.html'
    '/foo' 'text/html' 'foo.html'
    # Soft-404 (#267/#408): a binary URL served as HTML is named .html.
    '/x.pdf' 'text/html' 'x.html'
    '/x.gif' 'text/html' 'x.html'
    # Type agrees with the extension: keep it, no churn, no double extension.
    '/x.pdf' 'application/pdf' 'x.pdf'
    '/x.jpg' 'image/jpeg' 'x.jpg'
    '/x.html' 'text/html' 'x.html'
    '/x.js' 'application/x-javascript' 'x.js'
    '/types/data.json' 'application/json' 'data.json'
)
for ((i = 0; i < ${#cases[@]}; i += 3)); do
    name "${cases[i]}" "${cases[i + 1]}" "${cases[i + 2]}"
done

View File

@@ -40,6 +40,7 @@ TESTS = \
01_engine-parse.test \
01_engine-rcfile.test \
01_engine-relative.test \
01_engine-savename.test \
01_engine-simplify.test \
01_engine-strsafe.test \
02_manpage-regen.test \