mirror of
https://github.com/xroche/httrack.git
synced 2026-07-02 23:24:03 +03:00
Compare commits
1 Commits
master
...
issue-159-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9ee7a94b40 |
@@ -3488,6 +3488,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil) {
|
||||
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
|
||||
const int norm_query = opt->urlhack && !opt->no_query_dedup;
|
||||
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (strcasecmp(jump_identification_const(moved_adr),
|
||||
jump_identification_const(cur_adr)) != 0)
|
||||
return HTS_FALSE;
|
||||
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
|
||||
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
|
||||
return strcasecmp(n_fil, pn_fil) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3533,36 +3551,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3590,7 +3581,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3613,7 +3614,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3637,7 +3642,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user interactions: pause, add link, delete link..
|
||||
*/
|
||||
|
||||
@@ -45,6 +45,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htslib.h"
|
||||
#include "htsparse.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsdns_selftest.h"
|
||||
#include "htscharset.h"
|
||||
@@ -1340,6 +1341,37 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
|
||||
* alias. */
|
||||
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
|
||||
/* scheme and userinfo collapse (the #159 case); a different path does not */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
|
||||
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
|
||||
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
|
||||
*/
|
||||
opt->urlhack = HTS_TRUE;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
|
||||
/* slash/query fold only when the dedup flag is on */
|
||||
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
|
||||
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
#undef SAME
|
||||
printf("redirect-samefile self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
@@ -1757,6 +1789,8 @@ static const struct selftest_entry {
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
|
||||
st_redirect_samefile},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# Issue #159: when a redirect targets an alias that saves to the same local
# file (http<->https, user@host, ..), it has to be followed through instead of
# being replaced by a self-pointing "moved" stub. The helper that makes this
# decision is covered by the engine's built-in self-test, which prints the
# marker below on success.
readonly expected="redirect-samefile self-test OK"

httrack -O /dev/null -#test=redirect-samefile run |
  grep -q "$expected"
|
||||
@@ -44,6 +44,7 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
|
||||
Reference in New Issue
Block a user