Compare commits

..

3 Commits

Author SHA1 Message Date
Xavier Roche
b804ee2da1 htsparse: keep makestat_time out of ENGINE_SET_CONTEXT
makestat_time throttles the makestat/maketrack stats to once per minute:
the wait loop compares time_local() against it and, when it fires, writes
it back to the local. But the field is by-value in the extended context,
so it can't round-trip through ENGINE_SAVE_CONTEXT, while ENGINE_SET_CONTEXT
re-read it from the load-once baseline on every loop iteration. That reset
the local before the next compare, so under -%v / maketrack the throttle
never held and the stats line plus the full back-stack dump were emitted
every iteration.

Drop makestat_time (and the never-changing makestat_fp) from SET_CONTEXT;
they belong to the load-once set. Wrapped the macro in clang-format off/on
for the same backslash-realignment reason as HT_ADD_END.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-01 10:50:42 +02:00
Xavier Roche
20317cb85b htsparse: free the cache buffer in HT_ADD_END
The not-modified fast path reads the stored //[HTML-MD5]// digest via
cache_readdata, which malloc's the buffer, but never freed it. Every page
whose on-disk size already matches the freshly rewritten one leaks that
buffer. Free it after the compare.

Wrapped the macro in clang-format off/on: it is hand-aligned and
clang-format realigns every backslash on any edit, churning untouched
lines.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-01 10:50:42 +02:00
Xavier Roche
98e382390b htsparse: reserve 6x room for full HTML-escaping, not 5x
HT_ADD_HTMLESCAPED_ANY reserved strlen*5+1024 on the assumption that
"&amp;" (5 bytes) is the worst-case expansion. That holds for
escape_for_html_print, but escape_for_html_print_full turns a high byte
into "&#xHH;" (6 bytes). Past ~1023 high bytes the reservation is short,
so the escaper hits its internal cap: it truncates the string mid-run and
its overflow return counts the terminating NUL, which then lands inside
the mirrored HTML file. The only _full call site rewrites a link into a
2KB buffer, so a long non-ASCII local path triggers it.

Give the macro a per-function expansion factor (HTS_HTMLESCAPE_MAXEXP=5,
HTS_HTMLESCAPE_FULL_MAXEXP=6) and pass 6 for the _full variant. A new
escape-room self-test pins each function's real worst-case expansion
against the constant the macro reserves, so the two can't drift again.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-07-01 10:50:15 +02:00
18 changed files with 199 additions and 351 deletions

3
debian/control vendored
View File

@@ -1,8 +1,9 @@
Source: httrack
Section: web
Priority: optional
Maintainer: Xavier Roche <roche@httrack.com>
Standards-Version: 4.7.4
Build-Depends: debhelper-compat (= 14), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
Build-Depends: debhelper-compat (= 13), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
Rules-Requires-Root: no
Homepage: http://www.httrack.com
Vcs-Git: https://github.com/xroche/httrack.git

View File

@@ -1,6 +1,4 @@
# Maintainer uploads sign the changelog as xavier@debian.org while the control
# Maintainer is roche@httrack.com; lintian reads the address mismatch as an NMU.
httrack source: no-nmu-in-changelog
httrack source: changelog-should-mention-nmu
httrack source: source-nmu-has-incorrect-version-number
# The bundled HTML pages are the genuine upstream documentation taken from

View File

@@ -1,6 +0,0 @@
---
Repository: https://github.com/xroche/httrack.git
Repository-Browse: https://github.com/xroche/httrack
Bug-Database: https://github.com/xroche/httrack/issues
Bug-Submit: https://github.com/xroche/httrack/issues/new
Contact: Xavier Roche <roche@httrack.com>

View File

@@ -175,9 +175,7 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
//
socinput(soc, line, 1000);
if (strnotempty(line)) {
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
protocol[256] */
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
lien_adrfil af;
// méthode en majuscule

View File

@@ -441,72 +441,6 @@ void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
*makeindex_done = 1;
}
/* Flush the parsed HTML output buffer to disk, skipping the rewrite when the
 * file already on disk has the same size and the MD5 recorded for it in the
 * cache equals the MD5 of the new content.
 *
 *   opt      engine options; opt->state.exit_xh is set to -1 when
 *            check_fatal_io_errno() reports a fatal I/O error
 *   cache    cache read from / written to under the "//[HTML-MD5]//" key
 *   r        response block; r->notmodified is forwarded to file_notify() and
 *            r->lastmodified (when non-empty) becomes the file time
 *   fp       stream slot used for the write; closed and reset to NULL after
 *            a successful filecreate()
 *   ht_buff  content to write, ht_len bytes
 *   adr,fil  forwarded unchanged to file_notify()
 *   save     local filename to (re)write
 *
 * The new digest is stored back in the cache whenever cache->ndx is set,
 * whether or not the file was rewritten. */
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
                          FILE **fp, const char *ht_buff, size_t ht_len,
                          const char *adr, const char *fil, const char *save) {
  char digest[32 + 2];
  off_t fsize_old =
      fsize(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), save));
  int ok = 0; /* 1 when the on-disk file can be kept as is */

  digest[0] = '\0';
  domd5mem(ht_buff, ht_len, digest, 1);

  /* Same size on disk: compare against the digest stored in the cache. */
  if (fsize_old == (off_t) ht_len) {
    int mlen = 0;
    /* Start from NULL so the pointer is well defined below even if
       cache_readdata() returns without assigning it. */
    char *mbuff = NULL;

    cache_readdata(cache, "//[HTML-MD5]//", save, &mbuff, &mlen);
    /* Terminate only a buffer that really exists and has a positive length.
       NOTE(review): assumes the returned buffer has room for mlen + 1 bytes
       -- confirm against cache_readdata(). */
    if (mbuff != NULL && mlen > 0)
      mbuff[mlen] = '\0';
    if ((mlen == 32) && (strcmp(((mbuff != NULL) ? mbuff : ""), digest) == 0)) {
      ok = 1;
      hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s", save);
    }
    freet(mbuff);
  }

  if (!ok) {
    file_notify(opt, adr, fil, save, 1, 1, r->notmodified);
    *fp = filecreate(&opt->state.strc, save);
    if (*fp) {
      if (ht_len > 0 && fwrite(ht_buff, 1, ht_len, *fp) != ht_len) {
        /* Short write: classify errno before anything else can change it. */
        int fcheck = check_fatal_io_errno();

        if (fcheck)
          opt->state.exit_xh = -1;
        if (opt->log) {
          hts_log_print(opt, LOG_ERROR | LOG_ERRNO,
                        "Unable to write HTML file %s", save);
          if (fcheck)
            hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
        }
      }
      fclose(*fp);
      *fp = NULL;
      if (strnotempty(r->lastmodified))
        set_filetime_rfc822(save, r->lastmodified);
    } else {
      /* The file could not even be created. */
      int fcheck = check_fatal_io_errno();

      if (fcheck) {
        hts_log_print(opt, LOG_ERROR,
                      "Mirror aborted: disk full or filesystem problems");
        opt->state.exit_xh = -1;
      }
      hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", save);
      if (fcheck)
        hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
    }
  } else {
    /* Unchanged content: notify and note the file without touching it. */
    file_notify(opt, adr, fil, save, 0, 0, r->notmodified);
    filenote(&opt->state.strc, save, NULL);
  }

  /* Remember the digest of what was just produced for the next run. */
  if (cache->ndx)
    cache_writedata(cache->ndx, cache->dat, "//[HTML-MD5]//", save, digest,
                    (int) strlen(digest));
}
/* does it look like XML ? (SVG et al.) */
static int look_like_xml(const char *s) {
return strncmp(s, "<?xml", 5) == 0

View File

@@ -370,12 +370,6 @@ void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
const char *template_footer, const char *adr,
const char *fil);
// Flush ht_buff[0..ht_len] to save on disk (skip if MD5 unchanged); *fp
// closed+NULLed on write. Precondition: ht_len>0.
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
FILE **fp, const char *ht_buff, size_t ht_len,
const char *adr, const char *fil, const char *save);
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
int fspc(httrackp * opt, FILE * fp, const char *type);

View File

@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
}
}
}
/* reserve one byte for the trailing NUL written after the loop */
if (j + 1 >= max) {
/* copy */
if (j + 1 > max) {
/* overflow */
return -1;
}
@@ -314,8 +314,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
}
}
/* reserve one byte for the trailing NUL written after the loop */
if (j + 1 >= max) {
/* Check for overflow */
if (j + 1 > max) {
return -1;
}

View File

@@ -1149,8 +1149,7 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
linput(fp, line, 1000);
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
size_t ret;
// selon que l'on a ou pas un proxy
if (retour->req.proxy.active) {

View File

@@ -106,99 +106,159 @@ Please visit our Website: http://www.httrack.com
// does nothing
#define XH_uninit do {} while(0)
#define HT_ADD_FOP
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
load and store lists can't drift apart. */
/* clang-format off: an edit realigns all backslashes, churning the macro. */
/* clang-format off */
/* X-macro table. Each row is X(type, name, src): the type and name of one
   local variable, and the pointer expression (reached through stre) that the
   local is copied from or written back to. Expand it with one of the
   ENGINE_FIELD_* helpers below. */
#define ENGINE_MUTABLE_FIELDS(X) \
X(int, error, stre->error_) \
X(int, store_errpage, stre->store_errpage_) \
X(int, makeindex_done, stre->makeindex_done_) \
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
X(int, makeindex_links, stre->makeindex_links_) \
X(LLint, stat_fragment, stre->stat_fragment_)
/* Row expander: declare the local and initialise it from *src. */
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
/* Row expander: reload an already-declared local from *src. */
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
/* Row expander: write the local's current value back through src. */
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
/* Finish the page being parsed: flush output_buffer to savename() and release
   the buffer. Expands in place and uses these names from the enclosing scope:
   output_buffer, opt, cache, r, fp, savename(), urladr(), urlfil().

   When output_buffer is non-empty:
   - the MD5 of the buffer is computed into a local digest;
   - if the file on disk has the same size and the "//[HTML-MD5]//" cache entry
     for savename() equals that digest, the file is left alone (file_notify()
     with 0,0 and filenote());
   - otherwise the file is created and the buffer written; creation or write
     failures are logged and opt->state.exit_xh is set to -1 when
     check_fatal_io_errno() reports a fatal error; fp is closed and reset to
     NULL, and the file time is set from r->lastmodified when non-empty;
   - the digest is written back to the cache when cache->ndx is set.
   output_buffer is freed in every case.

   NOTE(review): mbuff is not initialised before cache_readdata(); the code
   relies on that call always assigning it -- confirm. */
#define HT_ADD_END { \
int ok=0;\
if (TypedArraySize(output_buffer) != 0) { \
const size_t ht_len = TypedArraySize(output_buffer); \
const char *const ht_buff = TypedArrayElts(output_buffer); \
char digest[32+2];\
off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),savename()));\
digest[0] = '\0';\
domd5mem(TypedArrayElts(output_buffer), ht_len, digest, 1);\
if (fsize_old == (off_t) ht_len) { \
int mlen = 0;\
char* mbuff;\
cache_readdata(cache,"//[HTML-MD5]//",savename(),&mbuff,&mlen);\
if (mlen) \
mbuff[mlen]='\0';\
if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
ok=1;\
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s",savename());\
} else {\
ok=0;\
} \
freet(mbuff);\
}\
if (!ok) { \
file_notify(opt,urladr(), urlfil(), savename(), 1, 1, r->notmodified); \
fp=filecreate(&opt->state.strc, savename()); \
if (fp) { \
if (ht_len>0) {\
if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
int fcheck;\
if ((fcheck=check_fatal_io_errno())) {\
opt->state.exit_xh=-1;\
}\
if (opt->log) { \
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to write HTML file %s", savename());\
if (fcheck) {\
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
}\
}\
}\
}\
fclose(fp); fp=NULL; \
if (strnotempty(r->lastmodified)) \
set_filetime_rfc822(savename(),r->lastmodified); \
} else {\
int fcheck;\
if ((fcheck=check_fatal_io_errno())) {\
hts_log_print(opt, LOG_ERROR, "Mirror aborted: disk full or filesystem problems"); \
opt->state.exit_xh=-1;\
}\
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", savename());\
if (fcheck) {\
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
}\
}\
} else {\
file_notify(opt,urladr(), urlfil(), savename(), 0, 0, r->notmodified); \
filenote(&opt->state.strc, savename(),NULL); \
}\
if (cache->ndx)\
cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename(),digest,(int)strlen(digest));\
} \
TypedArrayFree(output_buffer); \
}
/* clang-format on */
#define HT_ADD_FOP
#define ENGINE_DEFINE_CONTEXT() \
ENGINE_DEFINE_CONTEXT_BASE(); \
/* */ \
htsblk* const r HTS_UNUSED = stre->r_; \
hash_struct* const hash HTS_UNUSED = stre->hash_; \
char* const codebase HTS_UNUSED = stre->codebase; \
char* const base HTS_UNUSED = stre->base; \
/* */ \
const char * const template_header HTS_UNUSED = stre->template_header_; \
const char * const template_body HTS_UNUSED = stre->template_body_; \
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
/* */ \
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
/* */ \
/* */ \
int error = * stre->error_; \
int store_errpage = * stre->store_errpage_; \
/* */ \
int makeindex_done = *stre->makeindex_done_; \
FILE* makeindex_fp = *stre->makeindex_fp_; \
int makeindex_links = *stre->makeindex_links_; \
/* */ \
LLint stat_fragment = *stre->stat_fragment_; \
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
/* clang-format off: an edit realigns all backslashes, churning the macro. */
/* clang-format off */
/* Load-once: re-reading resets makestat_time (mutated locally, never SAVEd). */
#define ENGINE_SET_CONTEXT() \
ENGINE_SET_CONTEXT_BASE(); \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
/* */ \
error = * stre->error_; \
store_errpage = * stre->store_errpage_; \
/* */ \
makeindex_done = *stre->makeindex_done_; \
makeindex_fp = *stre->makeindex_fp_; \
makeindex_links = *stre->makeindex_links_; \
/* */ \
stat_fragment = *stre->stat_fragment_
/* clang-format on */
#define ENGINE_LOAD_CONTEXT() \
ENGINE_DEFINE_CONTEXT()
#define ENGINE_SAVE_CONTEXT() \
ENGINE_SAVE_CONTEXT_BASE(); \
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
/* clang-format on */
/* */ \
* stre->error_ = error; \
* stre->store_errpage_ = store_errpage; \
/* */ \
*stre->makeindex_done_ = makeindex_done; \
*stre->makeindex_fp_ = makeindex_fp; \
*stre->makeindex_links_ = makeindex_links; \
/* */ \
*stre->stat_fragment_ = stat_fragment
#define _FILTERS (*opt->filters.filters)
#define _FILTERS_PTR (opt->filters.filptr)
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
/* States of the JS-detection automaton. Values 0..9 are the real states and
   are used as row indices of the transition table; INSCRIPT_DEFAULT is not a
   state but the synthetic "any other char" column of that table. */
typedef enum {
  INSCRIPT_START = 0,
  INSCRIPT_ANTISLASH = 1,
  INSCRIPT_INQUOTE = 2,
  INSCRIPT_INQUOTE2 = 3,
  INSCRIPT_SLASH = 4,
  INSCRIPT_SLASHSLASH = 5,
  INSCRIPT_COMMENT = 6,
  INSCRIPT_COMMENT2 = 7,
  INSCRIPT_ANTISLASH_IN_QUOTE = 8,
  INSCRIPT_ANTISLASH_IN_QUOTE2 = 9,
  INSCRIPT_DEFAULT = 256
} INSCRIPT;
/* Feed the byte under the parse cursor (*html) to the script automaton.
   Expands in place and relies on inscript, inscript_state, inscript_state_pos
   and html from the enclosing scope. Does nothing while inscript is zero.
   A negative table entry for the byte falls back to the INSCRIPT_DEFAULT
   column; the resulting state is asserted to be a valid row of the table. */
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
if (inscript) { \
int new_state_pos; \
new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*html]; \
if (new_state_pos < 0) { \
new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
} \
assertf(new_state_pos >= 0); \
assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
inscript_state_pos=new_state_pos; \
} \
} while(0)
/* Number of automaton states, i.e. the INSCRIPT values that are valid row
   indices (the INSCRIPT_DEFAULT column marker is not one of them). */
#define INSCRIPT_NSTATES 10 /* rows in the transition table */
/* Live view of the parser's automaton locals, set up once so the helpers below
can drive it without capturing them by lexical scope. Every member points at
a variable owned elsewhere; the struct itself stores no automaton state. */
typedef struct {
const int *inscript; /* nonzero while inside a script body */
/* Each row has 257 columns: indices 0..255 are looked up by byte value, and
   index 256 (INSCRIPT_DEFAULT) is the fallback column. */
const signed char (*table)[257]; /* [INSCRIPT_NSTATES][257] transitions */
INSCRIPT *pos; /* current state */
const char **html; /* parse cursor */
} script_automate;
/* Feed the current *html byte to the automaton. No-op outside a script body. */
static void hts_automate_lookup(const script_automate *aut) {
if (*aut->inscript) {
int next = aut->table[*aut->pos][(unsigned char) **aut->html];
if (next < 0) {
next = aut->table[*aut->pos][INSCRIPT_DEFAULT];
}
assertf(next >= 0 && next < INSCRIPT_NSTATES);
*aut->pos = (INSCRIPT) next;
}
}
/* Move the parse cursor forward 'steps' bytes, one byte at a time, running
   the automaton on each byte reached. Zero or negative 'steps' is a no-op. */
static void hts_automate_increment(const script_automate *aut, int steps) {
  int remaining;

  for (remaining = steps; remaining > 0; remaining--) {
    ++(*aut->html);
    hts_automate_lookup(aut);
  }
}
/* Advance the parse cursor (html, from the enclosing scope) by 'steps'
   characters, running AUTOMATE_LOOKUP_CURRENT_ADR() after each single-character
   step so the script automaton sees every byte. 'steps' is evaluated once and
   converted to int; a zero or negative value does nothing. */
#define INCREMENT_CURRENT_ADR(steps) do { \
int steps__ = (int) ( steps ); \
while(steps__ > 0) { \
html++; \
AUTOMATE_LOOKUP_CURRENT_ADR(); \
steps__ --; \
} \
} while(0)
/* Percent-encode the angle brackets of a string so it is safe to embed inside
an HTML comment (the default footer) or any other HTML context. A URL holding
@@ -343,7 +403,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
int incomment = 0; // dans un <!--
int inscript = 0; // dans un scipt pour applets javascript)
int inscript_locked = 0; // in locked script (ie. js file)
signed char inscript_state[INSCRIPT_NSTATES][257];
signed char inscript_state[10][257];
typedef enum {
INSCRIPT_START = 0,
INSCRIPT_ANTISLASH,
INSCRIPT_INQUOTE,
INSCRIPT_INQUOTE2,
INSCRIPT_SLASH,
INSCRIPT_SLASHSLASH,
INSCRIPT_COMMENT,
INSCRIPT_COMMENT2,
INSCRIPT_ANTISLASH_IN_QUOTE,
INSCRIPT_ANTISLASH_IN_QUOTE2,
INSCRIPT_DEFAULT = 256
} INSCRIPT;
INSCRIPT inscript_state_pos = INSCRIPT_START;
const char *inscript_name = NULL; // script tag name
int inscript_tag = 0; // on est dans un <body onLoad="... terminé par >
@@ -404,8 +477,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
inscript_state[INSCRIPT_COMMENT2]['*'] = INSCRIPT_COMMENT2;
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE; /* #8: escape in '' */
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE2; /* #9: escape in "" */
const script_automate saut = {&inscript, inscript_state,
&inscript_state_pos, &html};
/* Primary list or URLs */
if (ptr == 0) {
@@ -604,14 +675,13 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
// Decode title with encoding
if (str->page_charset_ != NULL &&
*str->page_charset_ != '\0') {
char *sUtf = hts_convertStringToUTF8(
s, strlen(s), str->page_charset_);
if (str->page_charset_ != NULL
&& *str->page_charset_ != '\0') {
char *const sUtf =
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
if (sUtf != NULL) {
/* UTF-8 can expand past s[]; truncate to fit */
snprintf(s, sizeof(s), "%s", sUtf);
freet(sUtf);
strcpy(s, sUtf);
free(sUtf);
}
}
@@ -845,7 +915,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
/* automate */
hts_automate_lookup(&saut);
AUTOMATE_LOOKUP_CURRENT_ADR();
// Note:
// Certaines pages ne respectent pas le html
@@ -1761,7 +1831,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
// sauter espaces
// adr+=p;
hts_automate_increment(&saut, p);
INCREMENT_CURRENT_ADR(p);
while((is_space(*html)
|| (inscriptgen && html[0] == '\\' && is_space(html[1])
)
@@ -1776,7 +1846,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
// puis quitter
// html++; // sauter les espaces, "" et cie
hts_automate_increment(&saut, 1);
INCREMENT_CURRENT_ADR(1);
}
/* Stop at \n (LF) if primary links or link lists */
@@ -1791,7 +1861,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
if (*html == '\\') {
if ((*(html + 1) == '\'') || (*(html + 1) == '"')) { // \" ou \'
// html+=2; // sauter
hts_automate_increment(&saut, 2);
INCREMENT_CURRENT_ADR(2);
}
}
}
@@ -1839,7 +1909,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
if (srcset_p) {
while(html < r->adr + r->size
&& (is_realspace(*html) || *html == ','))
hts_automate_increment(&saut, 1);
INCREMENT_CURRENT_ADR(1);
}
eadr = html;
@@ -3299,7 +3369,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
assertf(eadr - html >= 0); // Should not go back
if (eadr > html) {
hts_automate_increment(&saut, (int) (eadr - 1 - html));
INCREMENT_CURRENT_ADR(eadr - 1 - html);
}
// adr=eadr-1; // ** sauter
@@ -3318,8 +3388,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
q++; // skip whitespace and empty candidates
if (q < endp && *q != '\0' && *q != ',' && *q != quote
&& *q != '<' && *q != '>' && (unsigned char) *q >= 32) {
hts_automate_increment(
&saut, (int) (q - html)); // keep the automate in sync
INCREMENT_CURRENT_ADR(q - html); // keep the automate in sync
ok = 1;
goto srcset_next;
}
@@ -3459,12 +3528,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
/* Flush and save to disk */
if (TypedArraySize(output_buffer) != 0) {
hts_finish_html_file(
opt, cache, r, &fp, TypedArrayElts(output_buffer),
TypedArraySize(output_buffer), urladr(), urlfil(), savename());
}
TypedArrayFree(output_buffer);
HT_ADD_END; // achever
}
//
//
@@ -3489,24 +3553,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
return 0;
}
/* Tell whether a redirect lands on the same local file as its source (#159).
 * Hosts are compared case-insensitively after jump_identification_const();
 * paths are compared case-insensitively after fil_normalized_filtered_ex(),
 * with slash / query normalisation enabled only when opt->urlhack is set and
 * the matching no_*_dedup flag is clear. Full contract in htsparse.h. */
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
                                       const char *cur_fil,
                                       const char *moved_adr,
                                       const char *moved_fil) {
  char BIGSTK target_path[HTS_URLMAXSIZE * 2], source_path[HTS_URLMAXSIZE * 2];
  int fold_slash = 0;
  int fold_query = 0;

  if (opt->urlhack) {
    fold_slash = !opt->no_slash_dedup;
    fold_query = !opt->no_query_dedup;
  }

  /* Different host: cannot be the same file, no need to look at the paths. */
  if (strcasecmp(jump_identification_const(moved_adr),
                 jump_identification_const(cur_adr)) != 0) {
    return HTS_FALSE;
  }

  fil_normalized_filtered_ex(moved_fil, target_path, NULL, fold_slash,
                             fold_query);
  fil_normalized_filtered_ex(cur_fil, source_path, NULL, fold_slash,
                             fold_query);
  return strcasecmp(target_path, source_path) == 0;
}
/*
Check 301, 302, .. statuscodes (moved)
*/
@@ -3552,9 +3598,36 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
if ((reponse =
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
int set_prio_to = 0; // pas de priotité fixéd par wizard
// A same-file alias redirect must be followed, not stubbed (#159).
const hts_boolean same_savefile = hts_redirect_same_savefile(
opt, urladr(), urlfil(), moved->adr, moved->fil);
// check whether URLHack is harmless or not (per the effective
// sub-flags)
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
!opt->no_query_dedup)) {
const int norm_host = !opt->no_www_dedup;
const int norm_slash = !opt->no_slash_dedup;
const int norm_query = !opt->no_query_dedup;
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
strlcpybuff(n_adr,
norm_host ? jump_normalized_const(moved->adr)
: jump_identification_const(moved->adr),
sizeof(n_adr));
strlcpybuff(pn_adr,
norm_host ? jump_normalized_const(urladr())
: jump_identification_const(urladr()),
sizeof(pn_adr));
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
norm_query);
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
norm_query);
if (strcasecmp(n_adr, pn_adr) == 0
&& strcasecmp(n_fil, pn_fil) == 0) {
hts_log_print(opt, LOG_WARNING,
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
}
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
// c'est (en gros) la même URL..
// si c'est un problème de casse dans le host c'est que le serveur est buggé
@@ -3582,17 +3655,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
moved->adr, moved->fil);
}
} else if (same_savefile) {
// A stub would point at itself; follow the redirect instead.
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
&set_prio_to, NULL) != 1) {
get_it = 1;
hts_log_print(opt, LOG_WARNING,
"Redirect to a same-file alias, fetching real "
"content: %s%s -> %s%s",
urladr(), urlfil(), moved->adr, moved->fil);
}
} /* sinon traité normalement */
} /* sinon traité normalement */
}
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
@@ -3615,11 +3678,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
heap(heap(ptr)->precedent)->adr,
heap(heap(ptr)->precedent)->fil, opt,
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
// Same-file alias: the reserved name is the invalidated source,
// so record anyway.
if (same_savefile ||
hash_read(hash, savedmoved.save, NULL,
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien avec SAV IDENTIQUE
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
// mode test?
@@ -3643,6 +3702,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
"moving %s to an existing file %s",
heap(ptr)->fil, urlfil());
}
}
}

View File

@@ -116,19 +116,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
int hts_mirror_check_moved(htsmoduleStruct * str,
htsmoduleStructExtended * stre);
/*
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
same local file, so it must be followed rather than turned into a
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
stripped, www kept (www dedup is the crawl layer's job), path
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
on the dedup hash, which folds www and never collapses http<->https.
*/
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
const char *cur_fil,
const char *moved_adr,
const char *moved_fil);
/*
Process user interactions: pause, add link, delete link..
*/

View File

@@ -45,7 +45,6 @@ Please visit our Website: http://www.httrack.com
#include "htscore.h"
#include "htsdefines.h"
#include "htslib.h"
#include "htsparse.h"
#include "htscache_selftest.h"
#include "htsdns_selftest.h"
#include "htscharset.h"
@@ -708,8 +707,7 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
}
s = strdupt(argv[0]);
enc = argc >= 2 ? argv[1] : "UTF-8";
if (s != NULL &&
hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1, enc) == 0) {
if (s != NULL && hts_unescapeEntitiesWithCharset(s, s, strlen(s), enc) == 0) {
printf("%s\n", s);
freet(s);
} else {
@@ -718,23 +716,6 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
return 0;
}
/* Regression test for the unescapers' NUL reservation: an output of exactly
   'max' characters must be refused, because storing it plus the terminator
   would write dest[max] (a 1-byte out-of-bounds write, caught by ASan before
   the fix). unescapeEntities and unescapeUrl share the same guard. */
static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
  char out[4];
  const size_t cap = sizeof(out);

  (void) opt;
  (void) argc;
  (void) argv;

  /* four characters plus the NUL do not fit in four bytes */
  assertf(hts_unescapeEntities("abcd", out, cap) == -1);
  assertf(hts_unescapeUrl("abcd", out, cap) == -1);

  /* three characters plus the NUL fill the buffer exactly */
  assertf(hts_unescapeEntities("abc", out, cap) == 0);
  assertf(strcmp(out, "abc") == 0);

  printf("unescape-bounds self-test OK\n");
  return 0;
}
static int st_hashtable(httrackp *opt, int argc, char **argv) {
char *snum;
unsigned long count = 0;
@@ -1359,37 +1340,6 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
return 0;
}
/* #159: self-test for hts_redirect_same_savefile(), which decides whether a
 * redirect is a same-file alias. The test overrides opt->urlhack and the three
 * no_*_dedup flags while it runs and puts the caller's values back before
 * returning. Prints a one-line OK marker and returns 0; failures abort through
 * assertf(). */
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
  /* Snapshot every option this test overrides. */
  const int saved_urlhack = opt->urlhack;
  const int saved_no_www_dedup = opt->no_www_dedup;
  const int saved_no_slash_dedup = opt->no_slash_dedup;
  const int saved_no_query_dedup = opt->no_query_dedup;

  (void) argc;
  (void) argv;
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
  /* scheme and userinfo collapse (the #159 case); a different path does not */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
  assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
  assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
  /* www stays distinct here; the crawl's dedup layer folds www, not this
     helper */
  opt->urlhack = HTS_TRUE;
  opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
  assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
  /* slash/query fold only when the dedup flag is on */
  assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
  assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
  assertf(
      !SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
  /* but a pure scheme alias still collapses regardless of dedup opt-outs */
  assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
#undef SAME
  /* Restore the caller's options instead of leaving the test's overrides. */
  opt->urlhack = saved_urlhack;
  opt->no_www_dedup = saved_no_www_dedup;
  opt->no_slash_dedup = saved_no_slash_dedup;
  opt->no_query_dedup = saved_no_query_dedup;
  printf("redirect-samefile self-test OK\n");
  return 0;
}
// hts_finish_makeindex writes the footer, emits the refresh meta only when
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
static int st_makeindex(httrackp *opt, int argc, char **argv) {
@@ -1807,8 +1757,6 @@ static const struct selftest_entry {
st_stripquery},
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
st_urlhack},
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
st_redirect_samefile},
{"mime", "<filename>", "MIME type for a filename", st_mime},
{"charset", "<charset> <string>",
"convert a string to UTF-8 from a charset", st_charset},
@@ -1817,8 +1765,6 @@ static const struct selftest_entry {
{"idna-decode", "<host>", "decode an IDNA/punycode hostname",
st_idna_decode},
{"entities", "<string> [encoding]", "unescape HTML entities", st_entities},
{"unescape-bounds", "", "unescapers reserve the NUL byte (no 1-byte OOB)",
st_unescape_bounds},
{"hashtable", "<count|file>", "coucal hashtable stress test", st_hashtable},
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
st_strsafe},

View File

@@ -1,9 +0,0 @@
#!/bin/bash
#

set -euo pipefail

# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
# followed through, not turned into a self-pointing "moved" stub. The decision
# helper is exercised by the engine self-test.
#
# Capture the output before matching: under pipefail, `httrack | grep -q` can
# fail spuriously when grep exits on the first match while httrack is still
# writing and gets SIGPIPE. A failing httrack still aborts the script (set -e).
out=$(httrack -O /dev/null -#test=redirect-samefile run)
grep -q "redirect-samefile self-test OK" <<<"$out"

View File

@@ -1,7 +0,0 @@
#!/bin/bash
#

set -euo pipefail

# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
#
# Capture the output before matching: under pipefail, `httrack | grep -q` can
# fail spuriously when grep exits on the first match while httrack is still
# writing and gets SIGPIPE. A failing httrack still aborts the script (set -e).
out=$(httrack -O /dev/null -#test=unescape-bounds run)
grep -q "unescape-bounds self-test OK" <<<"$out"

View File

@@ -1,13 +0,0 @@
#!/bin/bash
# Issue #279: an anchored link (target.html#sec, quoted or bare) fetches the
# target with the fragment dropped (strict server 400s on a '#' in the request)
# but keeps it in the rewritten local link so the anchor still works.
set -e
# Default top_srcdir to the parent directory when it is unset or empty.
: "${top_srcdir:=..}"
# Crawl the fraglink fixture and require: no errors, the target page mirrored,
# and both rewritten links in the mirrored index still carrying their fragment
# (one unquoted, one quoted).
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'fraglink/target.html' \
--file-matches 'fraglink/index.html' 'href=target\.html#sec' \
--file-matches 'fraglink/index.html' 'href="target\.html#sec2"' \
httrack 'BASEURL/fraglink/index.html'

View File

@@ -6,7 +6,6 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
local-crawl.sh local-server.py server.crt server.key \
server-root/simple/basic.html server-root/simple/link.html \
server-root/stripquery/index.html server-root/stripquery/a.html \
server-root/fraglink/index.html server-root/fraglink/target.html \
fixtures/cache-golden/hts-cache/new.zip
TESTS_ENVIRONMENT =
@@ -44,7 +43,6 @@ TESTS = \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-redirect.test \
01_engine-relative.test \
01_engine-robots.test \
01_engine-savename.test \
@@ -54,7 +52,6 @@ TESTS = \
01_engine-stripquery.test \
01_engine-strsafe.test \
01_engine-urlhack.test \
01_engine-unescape-bounds.test \
01_engine-useragent.test \
01_zlib-acceptencoding.test \
01_zlib-cache.test \
@@ -86,7 +83,6 @@ TESTS = \
26_local-strip-query.test \
27_local-cookies-file.test \
28_local-pause.test \
29_local-redirect-fragment.test \
30_local-fragment-link.test
29_local-redirect-fragment.test
CLEANFILES = check-network_sh.cache

View File

@@ -15,11 +15,8 @@
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
# --errors N --files N --found PATH ... --directory PATH ... \
# --log-found REGEX ... --log-not-found REGEX ... \
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
# httrack BASEURL/some/path [httrack-args...]
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
# host root), to assert rewritten link/content survived the crawl.
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
# which the ephemeral port forces into the cookie domain) and passes it to
# httrack via --cookies-file, to exercise preloaded cookies.
@@ -124,10 +121,6 @@ while test "$pos" -lt "$nargs"; do
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
pos=$((pos + 1))
;;
--file-matches | --file-not-matches)
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
pos=$((pos + 2))
;;
httrack)
pos=$((pos + 1))
break
@@ -301,24 +294,6 @@ while test "$i" -lt "${#audit[@]}"; do
exit 1
else result "OK"; fi
;;
--file-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} matches ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
result "no match"
exit 1
fi
;;
--file-not-matches)
path="${audit[$((i + 1))]}"
i=$((i + 2))
info "checking ${path} lacks ${audit[$i]}"
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
result "matched"
exit 1
else result "OK"; fi
;;
esac
i=$((i + 1))
done

View File

@@ -1,4 +0,0 @@
<html><body>
<a href=target.html#sec>unquoted fragment link</a>
<a href="target.html#sec2">quoted fragment link</a>
</body></html>

View File

@@ -1 +0,0 @@
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>