From 6da794fdb68ecf56cccbbc280b78a81eaead4a4e Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Wed, 24 Jun 2026 18:09:14 +0200 Subject: [PATCH] Keep unrecognized URL tails instead of mangling them to .html url_savename truncated any trailing ".token" when applying a resolved content-type, so /article-1.884291 served as text/html was saved as article-1.html, dropping the .884291 tail and colliding with every sibling sharing the prefix. Cut the old extension only when it is the resolved type, a known MIME extension, a dynamic-page extension, or an html-family extension; otherwise keep the tail and append the type (article-1.884291.html). Recognized extensions still collapse as before, so the #267/#408 soft-404 behavior (a binary URL served as HTML named .html) is preserved, and a type that agrees with the extension causes no churn. Add a hidden -#N self-test driving url_savename offline, plus tests/01_engine-savename.test covering the matrix. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Roche --- src/htscoremain.c | 38 +++++++++++++++++++++++++++++++++++ src/htsname.c | 21 ++++++++++++------- tests/01_engine-savename.test | 35 ++++++++++++++++++++++++++++++++ tests/Makefile.am | 1 + 4 files changed, 88 insertions(+), 7 deletions(-) create mode 100755 tests/01_engine-savename.test diff --git a/src/htscoremain.c b/src/htscoremain.c index 2c6bb6b..b72e785 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -2468,6 +2468,44 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) { htsmain_free(); return err; } break; + case 'N': { // url_savename name resolution: httrack -#N <fil> + // <content-type> + if (na + 2 < argc) { + lien_adrfilsave afs; + cache_back cache; + struct_back *sback; + hash_struct hash; + lien_back headers; + + memset(&afs, 0, sizeof(afs)); + strcpybuff(afs.af.adr, "www.example.com"); + strcpybuff(afs.af.fil, argv[na + 1]); + + memset(&cache, 0, sizeof(cache)); + cache.hashtable = (void *) coucal_new(0); + + sback = back_new(opt, opt->maxsoc 
* 32 + 1024); + hash_init(opt, &hash, opt->urlhack); + + memset(&headers, 0, sizeof(headers)); + headers.status = 0; + headers.r.statuscode = HTTP_OK; + strcpybuff(headers.r.contenttype, argv[na + 2]); + strcpybuff(headers.url_fil, argv[na + 1]); + + url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, + &hash, 0, 0, &headers); + printf("savename: %s\n", afs.save); + htsmain_free(); + return 0; + } else { + fprintf( + stderr, + "Option #N requires arguments\n"); + htsmain_free(); + return 1; + } + } break; case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file { int hasFilter = 0; diff --git a/src/htsname.c b/src/htsname.c index aabd706..9f971fb 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -760,9 +760,9 @@ int url_savename(lien_adrfilsave *const afs, strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http) } } - // Changer extension? - // par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas - if (ext_chg && !opt->no_type_change) { // changer ext + // Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm + // depending on the resolved type. + if (ext_chg && !opt->no_type_change) { char *a = fil + strlen(fil) - 1; if ((opt->debug > 1) && (opt->log != NULL)) { @@ -774,11 +774,18 @@ int url_savename(lien_adrfilsave *const afs, adr_complete, fil_complete, ext); } if (ext_chg == 1) { + // Cut the old extension only when it is the new one or a recognized one; + // an unknown trailing ".token" (e.g. /article-1.884291, #115) is part of + // the name, not an extension to replace. + const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil); + const int known_ext = strfield2(old_ext, ext) || is_knowntype(opt, fil) || + is_dyntype(old_ext) || ishtml_ext(old_ext) != -1; + while((a > fil) && (*a != '.') && (*a != '/')) a--; - if (*a == '.') - *a = '\0'; // couper - strcatbuff(fil, "."); // recopier point + if (*a == '.' 
&& known_ext) + *a = '\0'; // cut + strcatbuff(fil, "."); // re-add the dot } else { while((a > fil) && (*a != '/')) a--; @@ -786,7 +793,7 @@ int url_savename(lien_adrfilsave *const afs, a++; *a = '\0'; } - strcatbuff(fil, ext); // copier ext/nom + strcatbuff(fil, ext); // append ext/name } // Rechercher premier / et dernier . { diff --git a/tests/01_engine-savename.test b/tests/01_engine-savename.test new file mode 100755 index 0000000..02d7379 --- /dev/null +++ b/tests/01_engine-savename.test @@ -0,0 +1,35 @@ +#!/bin/bash +# + +set -euo pipefail + +# Local save-name extension resolution (url_savename via -#N <fil> <content-type>). +# Asserts on the basename of "savename: <path>". + +name() { + out="$(httrack -O /dev/null -#N "$1" "$2" | sed -n 's/^savename: //p')" + test "${out##*/}" == "$3" || { + echo "FAIL: '$1' '$2' -> '$out' (want '$3')" + exit 1 + } +} + +# #115: an unknown trailing ".token" is part of the name, keep it and append the type. +name '/article-1.884291' 'text/html' 'article-1.884291.html' +name '/news/story-12345.987654' 'text/html' 'story-12345.987654.html' + +# Recognized extensions still collapse to the resolved type. +name '/page.php' 'text/html' 'page.html' +name '/page.asp' 'text/html' 'page.html' +name '/foo' 'text/html' 'foo.html' + +# Soft-404 (#267/#408): a binary URL served as HTML is named .html. +name '/x.pdf' 'text/html' 'x.html' +name '/x.gif' 'text/html' 'x.html' + +# Type agrees with the extension: keep it, no churn, no double extension. 
+name '/x.pdf' 'application/pdf' 'x.pdf' +name '/x.jpg' 'image/jpeg' 'x.jpg' +name '/x.html' 'text/html' 'x.html' +name '/x.js' 'application/x-javascript' 'x.js' +name '/types/data.json' 'application/json' 'data.json' diff --git a/tests/Makefile.am b/tests/Makefile.am index 87bdb19..5e495e5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -40,6 +40,7 @@ TESTS = \ 01_engine-parse.test \ 01_engine-rcfile.test \ 01_engine-relative.test \ + 01_engine-savename.test \ 01_engine-simplify.test \ 01_engine-strsafe.test \ 02_manpage-regen.test \