mirror of
https://github.com/xroche/httrack.git
synced 2026-07-02 15:14:49 +03:00
Compare commits
4 Commits
htsparse-t
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92db2f2b41 | ||
|
|
ec52112446 | ||
|
|
1eaddc9c0e | ||
|
|
d97a7bdfd9 |
125
src/htsparse.c
125
src/htsparse.c
@@ -108,62 +108,47 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#define HT_ADD_FOP
|
||||
|
||||
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
|
||||
load and store lists can't drift apart. */
|
||||
/* clang-format off */
|
||||
#define ENGINE_MUTABLE_FIELDS(X) \
|
||||
X(int, error, stre->error_) \
|
||||
X(int, store_errpage, stre->store_errpage_) \
|
||||
X(int, makeindex_done, stre->makeindex_done_) \
|
||||
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
|
||||
X(int, makeindex_links, stre->makeindex_links_) \
|
||||
X(LLint, stat_fragment, stre->stat_fragment_)
|
||||
|
||||
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
|
||||
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
|
||||
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
htsblk* const r HTS_UNUSED = stre->r_; \
|
||||
hash_struct* const hash HTS_UNUSED = stre->hash_; \
|
||||
char* const codebase HTS_UNUSED = stre->codebase; \
|
||||
char* const base HTS_UNUSED = stre->base; \
|
||||
/* */ \
|
||||
const char * const template_header HTS_UNUSED = stre->template_header_; \
|
||||
const char * const template_body HTS_UNUSED = stre->template_body_; \
|
||||
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
|
||||
/* */ \
|
||||
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
|
||||
/* */ \
|
||||
/* */ \
|
||||
int error = * stre->error_; \
|
||||
int store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
int makeindex_done = *stre->makeindex_done_; \
|
||||
FILE* makeindex_fp = *stre->makeindex_fp_; \
|
||||
int makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
LLint stat_fragment = *stre->stat_fragment_; \
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
|
||||
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
|
||||
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
|
||||
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
|
||||
|
||||
/* clang-format off: an edit realigns all backslashes, churning the macro. */
|
||||
/* clang-format off */
|
||||
/* Load-once: re-reading resets makestat_time (mutated locally, never SAVEd). */
|
||||
#define ENGINE_SET_CONTEXT() \
|
||||
ENGINE_SET_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
error = * stre->error_; \
|
||||
store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
makeindex_done = *stre->makeindex_done_; \
|
||||
makeindex_fp = *stre->makeindex_fp_; \
|
||||
makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
stat_fragment = *stre->stat_fragment_
|
||||
/* clang-format on */
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
|
||||
|
||||
#define ENGINE_LOAD_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT()
|
||||
|
||||
#define ENGINE_SAVE_CONTEXT() \
|
||||
ENGINE_SAVE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
* stre->error_ = error; \
|
||||
* stre->store_errpage_ = store_errpage; \
|
||||
/* */ \
|
||||
*stre->makeindex_done_ = makeindex_done; \
|
||||
*stre->makeindex_fp_ = makeindex_fp; \
|
||||
*stre->makeindex_links_ = makeindex_links; \
|
||||
/* */ \
|
||||
*stre->stat_fragment_ = stat_fragment
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
|
||||
/* clang-format on */
|
||||
|
||||
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
|
||||
|
||||
@@ -3503,6 +3488,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil) {
|
||||
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
|
||||
const int norm_query = opt->urlhack && !opt->no_query_dedup;
|
||||
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (strcasecmp(jump_identification_const(moved_adr),
|
||||
jump_identification_const(cur_adr)) != 0)
|
||||
return HTS_FALSE;
|
||||
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
|
||||
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
|
||||
return strcasecmp(n_fil, pn_fil) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3548,36 +3551,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3605,7 +3581,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3628,7 +3614,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3652,7 +3642,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user intercations: pause, add link, delete link..
|
||||
*/
|
||||
|
||||
@@ -45,6 +45,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htslib.h"
|
||||
#include "htsparse.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsdns_selftest.h"
|
||||
#include "htscharset.h"
|
||||
@@ -1340,6 +1341,37 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
|
||||
* alias. */
|
||||
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
|
||||
/* scheme and userinfo collapse (the #159 case); a different path does not */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
|
||||
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
|
||||
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
|
||||
*/
|
||||
opt->urlhack = HTS_TRUE;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
|
||||
/* slash/query fold only when the dedup flag is on */
|
||||
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
|
||||
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
#undef SAME
|
||||
printf("redirect-samefile self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
@@ -1757,6 +1789,8 @@ static const struct selftest_entry {
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
|
||||
st_redirect_samefile},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
|
||||
# followed through, not turned into a self-pointing "moved" stub. The decision
|
||||
# helper is exercised by the engine self-test.
|
||||
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"
|
||||
13
tests/30_local-fragment-link.test
Executable file
13
tests/30_local-fragment-link.test
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
# Issue #279: an anchored link (target.html#sec, quoted or bare) fetches the
|
||||
# target with the fragment dropped (strict server 400s on a '#' in the request)
|
||||
# but keeps it in the rewritten local link so the anchor still works.
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'fraglink/target.html' \
|
||||
--file-matches 'fraglink/index.html' 'href=target\.html#sec' \
|
||||
--file-matches 'fraglink/index.html' 'href="target\.html#sec2"' \
|
||||
httrack 'BASEURL/fraglink/index.html'
|
||||
@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
server-root/fraglink/index.html server-root/fraglink/target.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -43,6 +44,7 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
@@ -83,6 +85,7 @@ TESTS = \
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -15,8 +15,11 @@
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
@@ -121,6 +124,10 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--file-matches | --file-not-matches)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
|
||||
pos=$((pos + 2))
|
||||
;;
|
||||
httrack)
|
||||
pos=$((pos + 1))
|
||||
break
|
||||
@@ -294,6 +301,24 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
|
||||
result "no match"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-not-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
|
||||
result "matched"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
4
tests/server-root/fraglink/index.html
Normal file
4
tests/server-root/fraglink/index.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html><body>
|
||||
<a href=target.html#sec>unquoted fragment link</a>
|
||||
<a href="target.html#sec2">quoted fragment link</a>
|
||||
</body></html>
|
||||
1
tests/server-root/fraglink/target.html
Normal file
1
tests/server-root/fraglink/target.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>
|
||||
Reference in New Issue
Block a user