Compare commits

...

3 Commits

Author SHA1 Message Date
Xavier Roche
92db2f2b41 htsparse: follow same-file redirects instead of self-pointing stubs (#159) (#471)
An http->https redirect (or any alias where the savename strips a
component the URL keeps) collapses the source and target onto one saved
file. hts_mirror_check_moved wrote a "Page has moved" stub linking to the
target's savename, but that equals the source's, so the stub pointed at
itself and the real content was never saved.

Detect a same-file alias with a new hts_redirect_same_savefile helper
(scheme and userinfo stripped, www kept, path slash/query-normalized per
the URL-hack dedup flags) and follow the redirect through: record the
moved link at the same savename so its content overwrites the placeholder.
Genuinely-different moves keep the stub. The old informational "URL Hack
identical" log is superseded by an actionable message on the followed
redirect. Covered by a redirect-samefile engine self-test.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 09:41:17 +02:00
Xavier Roche
ec52112446 tests: cover anchored-link (#frag) rewriting (#279) (#470)
An anchored hyperlink target.html#sec must fetch the target with the
fragment dropped yet keep the fragment in the rewritten local link so
the anchor still resolves. This already works; #279 is a stale
report from the Google Code era with no current repro.

Pin the behavior with a local-crawl test: the strict server 400s on a
'#' in the request-target (so a leaked fragment fails the fetch), and a
new --file-matches audit asserts the mirrored link keeps #sec/#sec2 for
both the unquoted and quoted forms.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-01 22:26:16 +02:00
Xavier Roche
1eaddc9c0e htsparse: drive the extended-context field list from one X-macro (#469)
ENGINE_SET_CONTEXT and ENGINE_SAVE_CONTEXT kept hand-maintained parallel
copies of the mutable extended-context fields. The two lists drifting apart is
how the makestat_time throttle bug got in: a field reloaded by SET with no
matching SAVE. Move the six mutable fields into a single ENGINE_MUTABLE_FIELDS
list that DEFINE, SET and SAVE each expand through their own operation, so a
load without a matching store can no longer be written.

Pure refactor: object code matches the prior macros apart from embedded
__LINE__ constants shifting with the smaller source.

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-01 22:02:03 +02:00
9 changed files with 160 additions and 69 deletions

View File

@@ -108,62 +108,47 @@ Please visit our Website: http://www.httrack.com
#define HT_ADD_FOP
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
load and store lists can't drift apart. */
/* clang-format off */
#define ENGINE_MUTABLE_FIELDS(X) \
X(int, error, stre->error_) \
X(int, store_errpage, stre->store_errpage_) \
X(int, makeindex_done, stre->makeindex_done_) \
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
X(int, makeindex_links, stre->makeindex_links_) \
X(LLint, stat_fragment, stre->stat_fragment_)
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
#define ENGINE_DEFINE_CONTEXT() \
ENGINE_DEFINE_CONTEXT_BASE(); \
/* */ \
htsblk* const r HTS_UNUSED = stre->r_; \
hash_struct* const hash HTS_UNUSED = stre->hash_; \
char* const codebase HTS_UNUSED = stre->codebase; \
char* const base HTS_UNUSED = stre->base; \
/* */ \
const char * const template_header HTS_UNUSED = stre->template_header_; \
const char * const template_body HTS_UNUSED = stre->template_body_; \
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
/* */ \
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
/* */ \
/* */ \
int error = * stre->error_; \
int store_errpage = * stre->store_errpage_; \
/* */ \
int makeindex_done = *stre->makeindex_done_; \
FILE* makeindex_fp = *stre->makeindex_fp_; \
int makeindex_links = *stre->makeindex_links_; \
/* */ \
LLint stat_fragment = *stre->stat_fragment_; \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
/* clang-format off: an edit realigns all backslashes, churning the macro. */
/* clang-format off */
/* Load-once: re-reading resets makestat_time (mutated locally, never SAVEd). */
#define ENGINE_SET_CONTEXT() \
ENGINE_SET_CONTEXT_BASE(); \
/* */ \
error = * stre->error_; \
store_errpage = * stre->store_errpage_; \
/* */ \
makeindex_done = *stre->makeindex_done_; \
makeindex_fp = *stre->makeindex_fp_; \
makeindex_links = *stre->makeindex_links_; \
/* */ \
stat_fragment = *stre->stat_fragment_
/* clang-format on */
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
#define ENGINE_LOAD_CONTEXT() \
ENGINE_DEFINE_CONTEXT()
#define ENGINE_SAVE_CONTEXT() \
ENGINE_SAVE_CONTEXT_BASE(); \
/* */ \
* stre->error_ = error; \
* stre->store_errpage_ = store_errpage; \
/* */ \
*stre->makeindex_done_ = makeindex_done; \
*stre->makeindex_fp_ = makeindex_fp; \
*stre->makeindex_links_ = makeindex_links; \
/* */ \
*stre->stat_fragment_ = stat_fragment
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
/* clang-format on */
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
@@ -3503,6 +3488,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
return 0;
}
/* Decide whether a redirect lands on the very file its source is saved to
 * (#159). The comparison follows the savename rules described in htsparse.h:
 * hosts are compared after jump_identification_const(), paths after
 * fil_normalized_filtered_ex(); both comparisons ignore case.
 *
 * opt                  options; urlhack plus the no_slash_dedup and
 *                      no_query_dedup opt-outs select the path folding
 * cur_adr, cur_fil     address and path of the URL that was redirected
 * moved_adr, moved_fil address and path of the redirect target
 *
 * Returns non-zero when both URLs map to the same local file. */
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
                                       const char *cur_fil,
                                       const char *moved_adr,
                                       const char *moved_fil) {
  char BIGSTK target_path[HTS_URLMAXSIZE * 2];
  char BIGSTK source_path[HTS_URLMAXSIZE * 2];
  int fold_slash = 0;
  int fold_query = 0;

  /* Slash/query folding is only requested when URL hack is enabled and the
     matching opt-out flag is clear. */
  if (opt->urlhack) {
    fold_slash = !opt->no_slash_dedup;
    fold_query = !opt->no_query_dedup;
  }

  /* Same host first; only then is it worth normalizing the two paths. */
  if (strcasecmp(jump_identification_const(moved_adr),
                 jump_identification_const(cur_adr)) == 0) {
    fil_normalized_filtered_ex(moved_fil, target_path, NULL, fold_slash,
                               fold_query);
    fil_normalized_filtered_ex(cur_fil, source_path, NULL, fold_slash,
                               fold_query);
    return strcasecmp(target_path, source_path) == 0;
  }
  return HTS_FALSE;
}
/*
Check 301, 302, .. statuscodes (moved)
*/
@@ -3548,36 +3551,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
if ((reponse =
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
}
// A same-file alias redirect must be followed, not stubbed (#159).
const hts_boolean same_savefile = hts_redirect_same_savefile(
opt, urladr(), urlfil(), moved->adr, moved->fil);
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
// c'est (en gros) la même URL..
// si c'est un problème de casse dans le host c'est que le serveur est buggé
@@ -3605,7 +3581,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
moved->adr, moved->fil);
}
} /* sinon traité normalement */
} else if (same_savefile) {
// A stub would point at itself; follow the redirect instead.
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
&set_prio_to, NULL) != 1) {
get_it = 1;
hts_log_print(opt, LOG_WARNING,
"Redirect to a same-file alias, fetching real "
"content: %s%s -> %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
} /* sinon traité normalement */
}
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
@@ -3628,7 +3614,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
heap(heap(ptr)->precedent)->adr,
heap(heap(ptr)->precedent)->fil, opt,
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// Same-file alias: the reserved name is the invalidated source,
// so record anyway.
if (same_savefile ||
hash_read(hash, savedmoved.save, NULL,
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien avec SAV IDENTIQUE
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
// mode test?
@@ -3652,7 +3642,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
"moving %s to an existing file %s",
heap(ptr)->fil, urlfil());
}
}
}

View File

@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
int hts_mirror_check_moved(htsmoduleStruct * str,
htsmoduleStructExtended * stre);
/*
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
same local file, so it must be followed rather than turned into a
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
stripped, www kept (www dedup is the crawl layer's job), path
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
on the dedup hash, which folds www and never collapses http<->https.

opt: supplies the URL-hack flags; slash (resp. query) folding is requested
only when urlhack is set and no_slash_dedup (resp. no_query_dedup) is clear.
cur_adr, cur_fil: address and path of the URL that answered with the redirect.
moved_adr, moved_fil: address and path of the redirect target.
Both the address and the path comparisons are case-insensitive.
*/
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil);
/*
Process user interactions: pause, add link, delete link..
*/

View File

@@ -45,6 +45,7 @@ Please visit our Website: http://www.httrack.com
#include "htscore.h"
#include "htsdefines.h"
#include "htslib.h"
#include "htsparse.h"
#include "htscache_selftest.h"
#include "htsdns_selftest.h"
#include "htscharset.h"
@@ -1340,6 +1341,37 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
return 0;
}
/* #159: self-test for hts_redirect_same_savefile, which decides whether a
 * redirect is a same-file alias.
 *
 * opt        options handed to the helper; the URL-hack flags are toggled
 *            during the run and put back to the caller's values at the end
 * argc, argv unused
 *
 * Returns 0 once every check passed and the OK line was printed. */
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
  /* The checks flip the URL-hack flags on the shared options; remember the
     caller's values so they are handed back untouched instead of being left
     at whatever the last check needed. */
  const int saved_urlhack = opt->urlhack;
  const int saved_no_www_dedup = opt->no_www_dedup;
  const int saved_no_slash_dedup = opt->no_slash_dedup;
  const int saved_no_query_dedup = opt->no_query_dedup;

  (void) argc;
  (void) argv;
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
  /* scheme and userinfo collapse (the #159 case); a different path does not */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
  assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
  assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
  /* www stays distinct here; the crawl's dedup layer folds www, not this helper
   */
  opt->urlhack = HTS_TRUE;
  opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
  assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
  /* slash/query fold only when the dedup flag is on */
  assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
  assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      !SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  /* but a pure scheme alias still collapses regardless of dedup opt-outs */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
#undef SAME
  /* hand the options back exactly as they were received */
  opt->urlhack = saved_urlhack;
  opt->no_www_dedup = saved_no_www_dedup;
  opt->no_slash_dedup = saved_no_slash_dedup;
  opt->no_query_dedup = saved_no_query_dedup;
  printf("redirect-samefile self-test OK\n");
  return 0;
}
// hts_finish_makeindex writes the footer, emits the refresh meta only when
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
static int st_makeindex(httrackp *opt, int argc, char **argv) {
@@ -1757,6 +1789,8 @@ static const struct selftest_entry {
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
st_redirect_samefile},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},

View File

@@ -0,0 +1,9 @@
#!/bin/bash
#
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
# followed through, not turned into a self-pointing "moved" stub. The decision
# helper is exercised by the engine self-test.
set -euo pipefail

# Capture the output before matching: under pipefail, `grep -q` quits at the
# first match, and a producer that is still writing then dies on SIGPIPE and
# fails the whole pipeline even though the expected line was found.
output="$(httrack -O /dev/null -#test=redirect-samefile run)"
grep -q "redirect-samefile self-test OK" <<<"$output"

View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Issue #279: anchored links (target.html#sec, both the bare and the quoted
# form) must be fetched without the fragment -- the strict server answers 400
# to a '#' in the request -- while the rewritten local link keeps the fragment
# so the anchor still resolves.
set -e
: "${top_srcdir:=..}"

page='fraglink/index.html'

# Audits handed to the local-crawl harness, in the order they are checked.
audits=(
  --errors 0
  --found 'fraglink/target.html'
  --file-matches "$page" 'href=target\.html#sec'
  --file-matches "$page" 'href="target\.html#sec2"'
)

bash "$top_srcdir/tests/local-crawl.sh" "${audits[@]}" \
  httrack "BASEURL/$page"

View File

@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html \
server-root/stripquery/index.html server-root/stripquery/a.html \
server-root/fraglink/index.html server-root/fraglink/target.html \
fixtures/cache-golden/hts-cache/new.zip
TESTS_ENVIRONMENT =
@@ -43,6 +44,7 @@ TESTS = \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-redirect.test \
01_engine-relative.test \
01_engine-robots.test \
01_engine-savename.test \
@@ -83,6 +85,7 @@ TESTS = \
26_local-strip-query.test \
27_local-cookies-file.test \
28_local-pause.test \
29_local-redirect-fragment.test
29_local-redirect-fragment.test \
30_local-fragment-link.test
CLEANFILES = check-network_sh.cache

View File

@@ -15,8 +15,11 @@
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
# host root), to assert rewritten link/content survived the crawl.
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
# which the ephemeral port forces into the cookie domain) and passes it to
# httrack via --cookies-file, to exercise preloaded cookies.
@@ -121,6 +124,10 @@ while test "$pos" -lt "$nargs"; do
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
;;
--file-matches | --file-not-matches)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
pos=$((pos + 2))
;;
httrack)
pos=$((pos + 1))
break
@@ -294,6 +301,24 @@ while test "$i" -lt "${#audit[@]}"; do
exit 1
else result "OK"; fi
;;
--file-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} matches ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
result "no match"
exit 1
fi
;;
--file-not-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} lacks ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
result "matched"
exit 1
else result "OK"; fi
;;
esac
i=$((i + 1))
done

View File

@@ -0,0 +1,4 @@
<html><body>
<a href=target.html#sec>unquoted fragment link</a>
<a href="target.html#sec2">quoted fragment link</a>
</body></html>

View File

@@ -0,0 +1 @@
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>