mirror of
https://github.com/xroche/httrack.git
synced 2026-07-02 23:24:03 +03:00
Compare commits
5 Commits
htsparse-t
...
phase0-par
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b0d131f084 | ||
|
|
92db2f2b41 | ||
|
|
ec52112446 | ||
|
|
1eaddc9c0e | ||
|
|
d97a7bdfd9 |
@@ -175,7 +175,9 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
//
|
||||
socinput(soc, line, 1000);
|
||||
if (strnotempty(line)) {
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
|
||||
protocol[256] */
|
||||
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
|
||||
lien_adrfil af;
|
||||
|
||||
// méthode en majuscule
|
||||
|
||||
@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* copy */
|
||||
if (j + 1 > max) {
|
||||
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
/* overflow */
|
||||
return -1;
|
||||
}
|
||||
@@ -314,8 +314,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for overflow */
|
||||
if (j + 1 > max) {
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1149,7 +1149,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
|
||||
|
||||
linput(fp, line, 1000);
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
|
||||
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
|
||||
size_t ret;
|
||||
// selon que l'on a ou pas un proxy
|
||||
if (retour->req.proxy.active) {
|
||||
|
||||
238
src/htsparse.c
238
src/htsparse.c
@@ -108,88 +108,97 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#define HT_ADD_FOP
|
||||
|
||||
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
|
||||
load and store lists can't drift apart. */
|
||||
/* clang-format off */
|
||||
#define ENGINE_MUTABLE_FIELDS(X) \
|
||||
X(int, error, stre->error_) \
|
||||
X(int, store_errpage, stre->store_errpage_) \
|
||||
X(int, makeindex_done, stre->makeindex_done_) \
|
||||
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
|
||||
X(int, makeindex_links, stre->makeindex_links_) \
|
||||
X(LLint, stat_fragment, stre->stat_fragment_)
|
||||
|
||||
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
|
||||
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
|
||||
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
htsblk* const r HTS_UNUSED = stre->r_; \
|
||||
hash_struct* const hash HTS_UNUSED = stre->hash_; \
|
||||
char* const codebase HTS_UNUSED = stre->codebase; \
|
||||
char* const base HTS_UNUSED = stre->base; \
|
||||
/* */ \
|
||||
const char * const template_header HTS_UNUSED = stre->template_header_; \
|
||||
const char * const template_body HTS_UNUSED = stre->template_body_; \
|
||||
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
|
||||
/* */ \
|
||||
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
|
||||
/* */ \
|
||||
/* */ \
|
||||
int error = * stre->error_; \
|
||||
int store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
int makeindex_done = *stre->makeindex_done_; \
|
||||
FILE* makeindex_fp = *stre->makeindex_fp_; \
|
||||
int makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
LLint stat_fragment = *stre->stat_fragment_; \
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
|
||||
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
|
||||
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
|
||||
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
|
||||
|
||||
/* clang-format off: an edit realigns all backslashes, churning the macro. */
|
||||
/* clang-format off */
|
||||
/* Load-once: re-reading resets makestat_time (mutated locally, never SAVEd). */
|
||||
#define ENGINE_SET_CONTEXT() \
|
||||
ENGINE_SET_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
error = * stre->error_; \
|
||||
store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
makeindex_done = *stre->makeindex_done_; \
|
||||
makeindex_fp = *stre->makeindex_fp_; \
|
||||
makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
stat_fragment = *stre->stat_fragment_
|
||||
/* clang-format on */
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
|
||||
|
||||
#define ENGINE_LOAD_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT()
|
||||
|
||||
#define ENGINE_SAVE_CONTEXT() \
|
||||
ENGINE_SAVE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
* stre->error_ = error; \
|
||||
* stre->store_errpage_ = store_errpage; \
|
||||
/* */ \
|
||||
*stre->makeindex_done_ = makeindex_done; \
|
||||
*stre->makeindex_fp_ = makeindex_fp; \
|
||||
*stre->makeindex_links_ = makeindex_links; \
|
||||
/* */ \
|
||||
*stre->stat_fragment_ = stat_fragment
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
|
||||
/* clang-format on */
|
||||
|
||||
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
|
||||
|
||||
/* Apply current *adr character for the script automate */
|
||||
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
|
||||
if (inscript) { \
|
||||
int new_state_pos; \
|
||||
new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*html]; \
|
||||
if (new_state_pos < 0) { \
|
||||
new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
|
||||
} \
|
||||
assertf(new_state_pos >= 0); \
|
||||
assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
|
||||
inscript_state_pos=new_state_pos; \
|
||||
} \
|
||||
} while(0)
|
||||
/* JS-detection automaton states; INSCRIPT_DEFAULT is the synthetic "any other
|
||||
char" column of the transition table. */
|
||||
/* States of the JS-detection automaton. Values 0..9 are dense because they
   index the rows of the transition table; INSCRIPT_DEFAULT is not a state but
   the index of the extra "any other char" column of that table. */
typedef enum {
  INSCRIPT_START = 0,
  INSCRIPT_ANTISLASH = 1,
  INSCRIPT_INQUOTE = 2,  /* the state ANTISLASH_IN_QUOTE falls back to */
  INSCRIPT_INQUOTE2 = 3, /* the state ANTISLASH_IN_QUOTE2 falls back to */
  INSCRIPT_SLASH = 4,
  INSCRIPT_SLASHSLASH = 5, /* presumably a line comment -- TODO confirm */
  INSCRIPT_COMMENT = 6,    /* presumably a block comment -- TODO confirm */
  INSCRIPT_COMMENT2 = 7,
  INSCRIPT_ANTISLASH_IN_QUOTE = 8,  /* escape inside '' */
  INSCRIPT_ANTISLASH_IN_QUOTE2 = 9, /* escape inside "" */
  INSCRIPT_DEFAULT = 256
} INSCRIPT;
|
||||
|
||||
/* Increment current pointer to 'steps' characters, modifying automate if necessary */
|
||||
#define INCREMENT_CURRENT_ADR(steps) do { \
|
||||
int steps__ = (int) ( steps ); \
|
||||
while(steps__ > 0) { \
|
||||
html++; \
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR(); \
|
||||
steps__ --; \
|
||||
} \
|
||||
} while(0)
|
||||
#define INSCRIPT_NSTATES 10 /* rows in the transition table */
|
||||
|
||||
/* Live view of the parser's automaton locals, set up once so the helpers below
|
||||
can drive it without capturing them by lexical scope. */
|
||||
typedef struct {
|
||||
const int *inscript; /* nonzero while inside a script body */
|
||||
const signed char (*table)[257]; /* [INSCRIPT_NSTATES][257] transitions */
|
||||
INSCRIPT *pos; /* current state */
|
||||
const char **html; /* parse cursor */
|
||||
} script_automate;
|
||||
|
||||
/* Feed the current *html byte to the automaton. No-op outside a script body. */
|
||||
static void hts_automate_lookup(const script_automate *aut) {
|
||||
if (*aut->inscript) {
|
||||
int next = aut->table[*aut->pos][(unsigned char) **aut->html];
|
||||
if (next < 0) {
|
||||
next = aut->table[*aut->pos][INSCRIPT_DEFAULT];
|
||||
}
|
||||
assertf(next >= 0 && next < INSCRIPT_NSTATES);
|
||||
*aut->pos = (INSCRIPT) next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
|
||||
static void hts_automate_increment(const script_automate *aut, int steps) {
|
||||
while (steps > 0) {
|
||||
(*aut->html)++;
|
||||
hts_automate_lookup(aut);
|
||||
steps--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Percent-encode the angle brackets of a string so it is safe to embed inside
|
||||
an HTML comment (the default footer) or any other HTML context. A URL holding
|
||||
@@ -334,20 +343,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int incomment = 0; // dans un <!--
|
||||
int inscript = 0; // dans un scipt pour applets javascript)
|
||||
int inscript_locked = 0; // in locked script (ie. js file)
|
||||
signed char inscript_state[10][257];
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
signed char inscript_state[INSCRIPT_NSTATES][257];
|
||||
INSCRIPT inscript_state_pos = INSCRIPT_START;
|
||||
const char *inscript_name = NULL; // script tag name
|
||||
int inscript_tag = 0; // on est dans un <body onLoad="... terminé par >
|
||||
@@ -408,6 +404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
inscript_state[INSCRIPT_COMMENT2]['*'] = INSCRIPT_COMMENT2;
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE; /* #8: escape in '' */
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE2; /* #9: escape in "" */
|
||||
const script_automate saut = {&inscript, inscript_state,
|
||||
&inscript_state_pos, &html};
|
||||
|
||||
/* Primary list or URLs */
|
||||
if (ptr == 0) {
|
||||
@@ -606,13 +604,14 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// Decode title with encoding
|
||||
if (str->page_charset_ != NULL
|
||||
&& *str->page_charset_ != '\0') {
|
||||
char *const sUtf =
|
||||
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
|
||||
if (str->page_charset_ != NULL &&
|
||||
*str->page_charset_ != '\0') {
|
||||
char *sUtf = hts_convertStringToUTF8(
|
||||
s, strlen(s), str->page_charset_);
|
||||
if (sUtf != NULL) {
|
||||
strcpy(s, sUtf);
|
||||
free(sUtf);
|
||||
/* UTF-8 can expand past s[]; truncate to fit */
|
||||
snprintf(s, sizeof(s), "%s", sUtf);
|
||||
freet(sUtf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -846,7 +845,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* automate */
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR();
|
||||
hts_automate_lookup(&saut);
|
||||
|
||||
// Note:
|
||||
// Certaines pages ne respectent pas le html
|
||||
@@ -1762,7 +1761,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// sauter espaces
|
||||
// adr+=p;
|
||||
INCREMENT_CURRENT_ADR(p);
|
||||
hts_automate_increment(&saut, p);
|
||||
while((is_space(*html)
|
||||
|| (inscriptgen && html[0] == '\\' && is_space(html[1])
|
||||
)
|
||||
@@ -1777,7 +1776,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// puis quitter
|
||||
// html++; // sauter les espaces, "" et cie
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
|
||||
/* Stop at \n (LF) if primary links or link lists */
|
||||
@@ -1792,7 +1791,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html == '\\') {
|
||||
if ((*(html + 1) == '\'') || (*(html + 1) == '"')) { // \" ou \'
|
||||
// html+=2; // sauter
|
||||
INCREMENT_CURRENT_ADR(2);
|
||||
hts_automate_increment(&saut, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1840,7 +1839,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (srcset_p) {
|
||||
while(html < r->adr + r->size
|
||||
&& (is_realspace(*html) || *html == ','))
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
eadr = html;
|
||||
|
||||
@@ -3300,7 +3299,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
assertf(eadr - html >= 0); // Should not go back
|
||||
if (eadr > html) {
|
||||
INCREMENT_CURRENT_ADR(eadr - 1 - html);
|
||||
hts_automate_increment(&saut, (int) (eadr - 1 - html));
|
||||
}
|
||||
// adr=eadr-1; // ** sauter
|
||||
|
||||
@@ -3319,7 +3318,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
q++; // skip whitespace and empty candidates
|
||||
if (q < endp && *q != '\0' && *q != ',' && *q != quote
|
||||
&& *q != '<' && *q != '>' && (unsigned char) *q >= 32) {
|
||||
INCREMENT_CURRENT_ADR(q - html); // keep the automate in sync
|
||||
hts_automate_increment(
|
||||
&saut, (int) (q - html)); // keep the automate in sync
|
||||
ok = 1;
|
||||
goto srcset_next;
|
||||
}
|
||||
@@ -3489,6 +3489,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
/* Tell whether the redirect (cur_adr,cur_fil) -> (moved_adr,moved_fil) lands
   on the same saved file.
   - hosts are compared case-insensitively after jump_identification_const();
   - paths are compared case-insensitively after fil_normalized_filtered_ex(),
     with slash/query folding enabled only when opt->urlhack is set and the
     matching no_*_dedup opt-out is clear.
   Returns non-zero when both host and path match. */
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
                                       const char *cur_fil,
                                       const char *moved_adr,
                                       const char *moved_fil) {
  char BIGSTK moved_norm[HTS_URLMAXSIZE * 2], cur_norm[HTS_URLMAXSIZE * 2];
  int fold_slash = 0;
  int fold_query = 0;

  if (opt->urlhack) {
    fold_slash = !opt->no_slash_dedup;
    fold_query = !opt->no_query_dedup;
  }

  /* different host: cannot be the same file, skip the path work */
  if (strcasecmp(jump_identification_const(moved_adr),
                 jump_identification_const(cur_adr)) != 0) {
    return HTS_FALSE;
  }

  fil_normalized_filtered_ex(moved_fil, moved_norm, NULL, fold_slash,
                             fold_query);
  fil_normalized_filtered_ex(cur_fil, cur_norm, NULL, fold_slash, fold_query);
  return strcasecmp(moved_norm, cur_norm) == 0;
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3534,36 +3552,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3591,7 +3582,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3614,7 +3615,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3638,7 +3643,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user interactions: pause, add link, delete link..
|
||||
*/
|
||||
|
||||
@@ -45,6 +45,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htslib.h"
|
||||
#include "htsparse.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsdns_selftest.h"
|
||||
#include "htscharset.h"
|
||||
@@ -707,7 +708,8 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
}
|
||||
s = strdupt(argv[0]);
|
||||
enc = argc >= 2 ? argv[1] : "UTF-8";
|
||||
if (s != NULL && hts_unescapeEntitiesWithCharset(s, s, strlen(s), enc) == 0) {
|
||||
if (s != NULL &&
|
||||
hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1, enc) == 0) {
|
||||
printf("%s\n", s);
|
||||
freet(s);
|
||||
} else {
|
||||
@@ -716,6 +718,23 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* The unescapers must reserve one byte for the trailing NUL: a 'max'-byte
|
||||
dest holding 'max' output chars pre-fix wrote dest[max] (1-byte OOB, caught
|
||||
by ASan). Both unescapeEntities and unescapeUrl share the guard. */
|
||||
/* Self-test: with a 4-byte destination, a 4-character result must be refused
   (-1) by both unescapers, and a 3-character result must be accepted and
   copied intact. The arguments are unused. */
static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
  char out[4];

  (void) opt;
  (void) argc;
  (void) argv;

  /* 4 chars + NUL need 5 bytes: must report overflow */
  assertf(hts_unescapeEntities("abcd", out, sizeof(out)) == -1);
  assertf(hts_unescapeUrl("abcd", out, sizeof(out)) == -1);

  /* 3 chars + NUL fill the buffer exactly: must succeed */
  assertf(hts_unescapeEntities("abc", out, sizeof(out)) == 0);
  assertf(strcmp(out, "abc") == 0);

  printf("unescape-bounds self-test OK\n");
  return 0;
}
|
||||
|
||||
static int st_hashtable(httrackp *opt, int argc, char **argv) {
|
||||
char *snum;
|
||||
unsigned long count = 0;
|
||||
@@ -1340,6 +1359,37 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
|
||||
* alias. */
|
||||
/* Self-test for hts_redirect_same_savefile() (#159).
   Toggles opt->urlhack and the no_*_dedup opt-outs to cover each combination,
   then puts the caller's values back so the test leaves 'opt' as it found it.
   argc/argv are unused. Prints an OK line and returns 0; a failed check
   trips assertf. */
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
  /* snapshot of every flag modified below, restored before returning */
  const int saved_urlhack = opt->urlhack;
  const int saved_no_www_dedup = opt->no_www_dedup;
  const int saved_no_slash_dedup = opt->no_slash_dedup;
  const int saved_no_query_dedup = opt->no_query_dedup;

  (void) argc;
  (void) argv;
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
  /* scheme and userinfo collapse (the #159 case); a different path does not */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
  assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
  assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
  /* www stays distinct here; the crawl's dedup layer folds www, not this
     helper */
  opt->urlhack = HTS_TRUE;
  opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
  assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
  /* slash/query fold only when the dedup flag is on */
  assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
  assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      !SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  /* but a pure scheme alias still collapses regardless of dedup opt-outs */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
#undef SAME

  /* restore the entry values rather than hard-coded defaults */
  opt->urlhack = saved_urlhack;
  opt->no_www_dedup = saved_no_www_dedup;
  opt->no_slash_dedup = saved_no_slash_dedup;
  opt->no_query_dedup = saved_no_query_dedup;

  printf("redirect-samefile self-test OK\n");
  return 0;
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
@@ -1757,6 +1807,8 @@ static const struct selftest_entry {
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
|
||||
st_redirect_samefile},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
@@ -1765,6 +1817,8 @@ static const struct selftest_entry {
|
||||
{"idna-decode", "<host>", "decode an IDNA/punycode hostname",
|
||||
st_idna_decode},
|
||||
{"entities", "<string> [encoding]", "unescape HTML entities", st_entities},
|
||||
{"unescape-bounds", "", "unescapers reserve the NUL byte (no 1-byte OOB)",
|
||||
st_unescape_bounds},
|
||||
{"hashtable", "<count|file>", "coucal hashtable stress test", st_hashtable},
|
||||
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
|
||||
st_strsafe},
|
||||
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
|
||||
# followed through, not turned into a self-pointing "moved" stub. The decision
|
||||
# helper is exercised by the engine self-test.
|
||||
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"
|
||||
7
tests/01_engine-unescape-bounds.test
Executable file
7
tests/01_engine-unescape-bounds.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
|
||||
httrack -O /dev/null -#test=unescape-bounds run | grep -q "unescape-bounds self-test OK"
|
||||
13
tests/30_local-fragment-link.test
Executable file
13
tests/30_local-fragment-link.test
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
# Issue #279: an anchored link (target.html#sec, quoted or bare) fetches the
|
||||
# target with the fragment dropped (strict server 400s on a '#' in the request)
|
||||
# but keeps it in the rewritten local link so the anchor still works.
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'fraglink/target.html' \
|
||||
--file-matches 'fraglink/index.html' 'href=target\.html#sec' \
|
||||
--file-matches 'fraglink/index.html' 'href="target\.html#sec2"' \
|
||||
httrack 'BASEURL/fraglink/index.html'
|
||||
@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
server-root/fraglink/index.html server-root/fraglink/target.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -43,6 +44,7 @@ TESTS = \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
@@ -52,6 +54,7 @@ TESTS = \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-unescape-bounds.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
@@ -83,6 +86,7 @@ TESTS = \
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -15,8 +15,11 @@
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
@@ -121,6 +124,10 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--file-matches | --file-not-matches)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
|
||||
pos=$((pos + 2))
|
||||
;;
|
||||
httrack)
|
||||
pos=$((pos + 1))
|
||||
break
|
||||
@@ -294,6 +301,24 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
|
||||
result "no match"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-not-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
|
||||
result "matched"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
4
tests/server-root/fraglink/index.html
Normal file
4
tests/server-root/fraglink/index.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html><body>
|
||||
<a href=target.html#sec>unquoted fragment link</a>
|
||||
<a href="target.html#sec2">quoted fragment link</a>
|
||||
</body></html>
|
||||
1
tests/server-root/fraglink/target.html
Normal file
1
tests/server-root/fraglink/target.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>
|
||||
Reference in New Issue
Block a user