mirror of
https://github.com/xroche/httrack.git
synced 2026-07-02 23:24:03 +03:00
Compare commits
17 Commits
html5-reso
...
phase0-par
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b0d131f084 | ||
|
|
92db2f2b41 | ||
|
|
ec52112446 | ||
|
|
1eaddc9c0e | ||
|
|
d97a7bdfd9 | ||
|
|
d2d02d87c2 | ||
|
|
4958bb8666 | ||
|
|
07da404cb8 | ||
|
|
694e45c698 | ||
|
|
db9ec2cc3b | ||
|
|
6a9ab2a11f | ||
|
|
13b31986d5 | ||
|
|
bd7e0989f6 | ||
|
|
bd74ec7cab | ||
|
|
1ed8ffad64 | ||
|
|
b68de172fa | ||
|
|
aabfd34380 |
5
.github/workflows/ci.yml
vendored
5
.github/workflows/ci.yml
vendored
@@ -269,8 +269,9 @@ jobs:
|
||||
MSAN_OPTIONS: abort_on_error=1:halt_on_error=1
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Engine self-tests only; the cache trio pulls in uninstrumented zlib.
|
||||
tests="$(cd tests && ls 01_engine-*.test | grep -v -- '-cache' | tr '\n' ' ')"
|
||||
# 01_engine-* only; zlib-dependent self-tests are named 01_zlib-* and
|
||||
# skipped here (uninstrumented libz floods MSan with false positives).
|
||||
tests="$(cd tests && ls 01_engine-*.test | tr '\n' ' ')"
|
||||
make check TESTS="$tests"
|
||||
|
||||
- name: Print the test log on failure
|
||||
|
||||
3
debian/control
vendored
3
debian/control
vendored
@@ -1,9 +1,8 @@
|
||||
Source: httrack
|
||||
Section: web
|
||||
Priority: optional
|
||||
Maintainer: Xavier Roche <roche@httrack.com>
|
||||
Standards-Version: 4.7.4
|
||||
Build-Depends: debhelper-compat (= 13), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Build-Depends: debhelper-compat (= 14), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Rules-Requires-Root: no
|
||||
Homepage: http://www.httrack.com
|
||||
Vcs-Git: https://github.com/xroche/httrack.git
|
||||
|
||||
4
debian/source/lintian-overrides
vendored
4
debian/source/lintian-overrides
vendored
@@ -1,4 +1,6 @@
|
||||
httrack source: changelog-should-mention-nmu
|
||||
# Maintainer uploads sign the changelog as xavier@debian.org while the control
|
||||
# Maintainer is roche@httrack.com; lintian reads the address mismatch as an NMU.
|
||||
httrack source: no-nmu-in-changelog
|
||||
httrack source: source-nmu-has-incorrect-version-number
|
||||
|
||||
# The bundled HTML pages are the genuine upstream documentation taken from
|
||||
|
||||
6
debian/upstream/metadata
vendored
Normal file
6
debian/upstream/metadata
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
Repository: https://github.com/xroche/httrack.git
|
||||
Repository-Browse: https://github.com/xroche/httrack
|
||||
Bug-Database: https://github.com/xroche/httrack/issues
|
||||
Bug-Submit: https://github.com/xroche/httrack/issues/new
|
||||
Contact: Xavier Roche <roche@httrack.com>
|
||||
@@ -129,6 +129,8 @@ typedef enum HTTPStatusCode {
|
||||
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
|
||||
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
|
||||
HTTP_EXPECTATION_FAILED = 417,
|
||||
HTTP_TOO_MANY_REQUESTS = 429,
|
||||
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
|
||||
HTTP_INTERNAL_SERVER_ERROR = 500,
|
||||
HTTP_NOT_IMPLEMENTED = 501,
|
||||
HTTP_BAD_GATEWAY = 502,
|
||||
|
||||
@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// catch_url_init(&port,&return_host);
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
|
||||
T_SOC soc;
|
||||
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
|
||||
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
|
||||
int i = 0;
|
||||
|
||||
do {
|
||||
@@ -175,7 +175,9 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
//
|
||||
socinput(soc, line, 1000);
|
||||
if (strnotempty(line)) {
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
|
||||
protocol[256] */
|
||||
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
|
||||
lien_adrfil af;
|
||||
|
||||
// méthode en majuscule
|
||||
|
||||
214
src/htscore.c
214
src/htscore.c
@@ -406,29 +406,106 @@ void hts_invalidate_link(httrackp * opt, int lpos) {
|
||||
opt->liens[lpos]->pass2 = -1;
|
||||
}
|
||||
|
||||
// Write the makeindex footer (refresh meta when makeindex_links==1), close
|
||||
// the file, then run usercommand.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil) {
|
||||
if (!*makeindex_done) {
|
||||
if (*makeindex_fp) {
|
||||
char BIGSTK tempo[1024];
|
||||
if (makeindex_links == 1) {
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE * 2];
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped));
|
||||
snprintf(tempo, sizeof(tempo),
|
||||
"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">" CRLF,
|
||||
link_escaped);
|
||||
} else
|
||||
tempo[0] = '\0';
|
||||
hts_template_format(*makeindex_fp, template_footer,
|
||||
"<!-- Mirror and index made by HTTrack Website "
|
||||
"Copier/" HTTRACK_VERSION " " HTTRACK_AFF_AUTHORS
|
||||
" -->",
|
||||
tempo, /* EOF */ NULL);
|
||||
fflush(*makeindex_fp);
|
||||
fclose(*makeindex_fp);
|
||||
*makeindex_fp = NULL;
|
||||
usercommand(opt, 0, NULL,
|
||||
fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_html_utf8), "index.html"),
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
*makeindex_done = 1;
|
||||
}
|
||||
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Flush the parsed HTML output buffer to disk, skipping the rewrite when the
|
||||
* on-disk MD5 is unchanged. */
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save) {
|
||||
char digest[32 + 2];
|
||||
off_t fsize_old =
|
||||
fsize(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), save));
|
||||
int ok = 0;
|
||||
|
||||
digest[0] = '\0';
|
||||
domd5mem(ht_buff, ht_len, digest, 1);
|
||||
if (fsize_old == (off_t) ht_len) {
|
||||
int mlen = 0;
|
||||
char *mbuff;
|
||||
|
||||
cache_readdata(cache, "//[HTML-MD5]//", save, &mbuff, &mlen);
|
||||
if (mlen)
|
||||
mbuff[mlen] = '\0';
|
||||
if ((mlen == 32) && (strcmp(((mbuff != NULL) ? mbuff : ""), digest) == 0)) {
|
||||
ok = 1;
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s", save);
|
||||
}
|
||||
freet(mbuff);
|
||||
}
|
||||
if (!ok) {
|
||||
file_notify(opt, adr, fil, save, 1, 1, r->notmodified);
|
||||
*fp = filecreate(&opt->state.strc, save);
|
||||
if (*fp) {
|
||||
if (ht_len > 0 && fwrite(ht_buff, 1, ht_len, *fp) != ht_len) {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck)
|
||||
opt->state.exit_xh = -1;
|
||||
if (opt->log) {
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO,
|
||||
"Unable to write HTML file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
}
|
||||
fclose(*fp);
|
||||
*fp = NULL;
|
||||
if (strnotempty(r->lastmodified))
|
||||
set_filetime_rfc822(save, r->lastmodified);
|
||||
} else {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
opt->state.exit_xh = -1;
|
||||
}
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
} else {
|
||||
file_notify(opt, adr, fil, save, 0, 0, r->notmodified);
|
||||
filenote(&opt->state.strc, save, NULL);
|
||||
}
|
||||
if (cache->ndx)
|
||||
cache_writedata(cache->ndx, cache->dat, "//[HTML-MD5]//", save, digest,
|
||||
(int) strlen(digest));
|
||||
}
|
||||
|
||||
/* does it look like XML ? (SVG et al.) */
|
||||
static int look_like_xml(const char *s) {
|
||||
@@ -1796,90 +1873,18 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
if (strnotempty(savename()) == 0) { // pas de chemin de sauvegarde
|
||||
if (strcmp(urlfil(), "/robots.txt") == 0) { // robots.txt
|
||||
if (r.adr) {
|
||||
int bptr = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK buff[8192];
|
||||
char BIGSTK infobuff[8192];
|
||||
int record = 0;
|
||||
|
||||
line[0] = '\0';
|
||||
buff[0] = '\0';
|
||||
infobuff[0] = '\0';
|
||||
//
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", r.adr);
|
||||
#endif
|
||||
do {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(r.adr + bptr, line, sizeof(line) - 2);
|
||||
/* strip comment */
|
||||
comm = strchr(line, '#');
|
||||
if (comm != NULL) {
|
||||
*comm = '\0';
|
||||
}
|
||||
/* strip spaces */
|
||||
llen = (int) strlen(line);
|
||||
while(llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a;
|
||||
|
||||
a = line + 11;
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // c pour nous
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack")
|
||||
|| strfield(a, "webhttrack")) {
|
||||
buff[0] = '\0'; // re-enregistrer
|
||||
infobuff[0] = '\0';
|
||||
record = 2; // locked
|
||||
#if DEBUG_ROBOTS
|
||||
printf("explicit disallow for httrack\n");
|
||||
#endif
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
if (strfield(line, "disallow:")) {
|
||||
char *a = line + 9;
|
||||
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (strnotempty(a)) {
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
if (strcmp(a, "/") != 0 ||
|
||||
opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
hts_boolean keep_root = (opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
#else
|
||||
hts_boolean keep_root = HTS_TRUE;
|
||||
#endif
|
||||
{ /* ignoring disallow: / */
|
||||
if ((strlen(buff) + strlen(a) + 8) < sizeof(buff)) {
|
||||
strcatbuff(buff, a);
|
||||
strcatbuff(buff, "\n");
|
||||
if ((strlen(infobuff) + strlen(a) + 8) <
|
||||
sizeof(infobuff)) {
|
||||
if (strnotempty(infobuff))
|
||||
strcatbuff(infobuff, ", ");
|
||||
strcatbuff(infobuff, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
else {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"Note: %s robots.txt rules are too restrictive, ignoring /",
|
||||
urladr());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
} while((bptr < r.size) && (strlen(buff) < (sizeof(buff) - 32)));
|
||||
if (strnotempty(buff)) {
|
||||
checkrobots_set(&robots, urladr(), buff);
|
||||
|
||||
robots_parse(&robots, urladr(), r.adr, r.size, infobuff,
|
||||
sizeof(infobuff), keep_root);
|
||||
if (strnotempty(infobuff)) {
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"Note: robots.txt forbidden links for %s are: %s",
|
||||
urladr(), infobuff);
|
||||
@@ -2116,7 +2121,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/*
|
||||
Ensure the index is being closed
|
||||
*/
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp, makeindex_links,
|
||||
makeindex_firstlink, template_footer, "", "");
|
||||
|
||||
/*
|
||||
updating-a-remotely-deteted-website hack
|
||||
|
||||
@@ -362,6 +362,20 @@ void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
// Finish the makeindex index.html (footer + refresh meta), run usercommand.
|
||||
// Updates *makeindex_done/*makeindex_fp in place; adr/fil are the mode strings.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil);
|
||||
|
||||
// Flush ht_buff[0..ht_len] to save on disk (skip if MD5 unchanged); *fp
|
||||
// closed+NULLed on write. Precondition: ht_len>0.
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
@@ -470,4 +484,8 @@ void voidf(void);
|
||||
/* HTML marker comment marking where the top index is spliced. */
|
||||
#define HTS_TOPINDEX "TOP_INDEX_HTTRACK"
|
||||
|
||||
/* Worst-case byte expansion HT_ADD_HTMLESCAPED* must reserve per escaper. */
|
||||
#define HTS_HTMLESCAPE_MAXEXP 5 /* escape_for_html_print: '&'->"&" */
|
||||
#define HTS_HTMLESCAPE_FULL_MAXEXP 6 /* _full: high byte->"&#xHH;" */
|
||||
|
||||
#endif
|
||||
|
||||
@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* copy */
|
||||
if (j + 1 > max) {
|
||||
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
/* overflow */
|
||||
return -1;
|
||||
}
|
||||
@@ -314,8 +314,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for overflow */
|
||||
if (j + 1 > max) {
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
73
src/htslib.c
73
src/htslib.c
@@ -1149,7 +1149,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
|
||||
|
||||
linput(fp, line, 1000);
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
|
||||
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
|
||||
size_t ret;
|
||||
// selon que l'on a ou pas un proxy
|
||||
if (retour->req.proxy.active) {
|
||||
@@ -1326,16 +1327,12 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
|
||||
// Compression accepted ?
|
||||
if (retour->req.http11) {
|
||||
hts_boolean compressible = HTS_FALSE;
|
||||
#if HTS_USEZLIB
|
||||
if ((!retour->req.range_used)
|
||||
&& (!retour->req.nocompression))
|
||||
print_buffer(&bstr, "Accept-Encoding: " "gzip" /* gzip if the preffered encoding */
|
||||
", " "identity;q=0.9" H_CRLF);
|
||||
else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
#else
|
||||
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
|
||||
compressible = (!retour->req.range_used && !retour->req.nocompression);
|
||||
#endif
|
||||
print_buffer(&bstr, "Accept-Encoding: %s" H_CRLF,
|
||||
hts_acceptencoding(compressible));
|
||||
}
|
||||
|
||||
/* Authentification */
|
||||
@@ -1951,6 +1948,10 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
|
||||
return "Requested Range Not Satisfiable";
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
case 501:
|
||||
@@ -4131,25 +4132,33 @@ DECLARE_APPEND_ESCAPE_VERSION(escape_uri)
|
||||
|
||||
#undef DECLARE_APPEND_ESCAPE_VERSION
|
||||
|
||||
// Same as above, but in-place
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_ ##NAME(char *const dest, const size_t size) { \
|
||||
char buffer[256]; \
|
||||
const size_t len = strnlen(dest, size); \
|
||||
const int in_buffer = len + 1 < sizeof(buffer); \
|
||||
char *src = in_buffer ? buffer : malloct(len + 1); \
|
||||
size_t ret; \
|
||||
assertf(src != NULL); \
|
||||
assertf(len < size); \
|
||||
memcpy(src, dest, len + 1); \
|
||||
ret = NAME(src, dest, size); \
|
||||
if (!in_buffer) { \
|
||||
freet(src); \
|
||||
} \
|
||||
return ret; \
|
||||
// In-place escaping: copy dest aside, then escape that copy back into dest.
|
||||
typedef size_t (*escape_fn_t)(const char *src, char *dest, size_t size);
|
||||
|
||||
static size_t inplace_escape(char *const dest, const size_t size,
|
||||
escape_fn_t escape) {
|
||||
char buffer[256];
|
||||
const size_t len = strnlen(dest, size);
|
||||
const int in_buffer = len + 1 < sizeof(buffer);
|
||||
char *src = in_buffer ? buffer : malloct(len + 1);
|
||||
size_t ret;
|
||||
assertf(src != NULL);
|
||||
assertf(len < size);
|
||||
memcpy(src, dest, len + 1);
|
||||
ret = escape(src, dest, size);
|
||||
if (!in_buffer) {
|
||||
freet(src);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Thin exported wrappers binding inplace_escape() to each escaper (ABI).
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_##NAME(char *const dest, const size_t size) { \
|
||||
return inplace_escape(dest, size, NAME); \
|
||||
}
|
||||
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_in_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_spc_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_uri_utf)
|
||||
@@ -4410,6 +4419,11 @@ HTSEXT_API void get_httptype(httrackp *opt, char *s, const char *fil,
|
||||
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, flag);
|
||||
}
|
||||
|
||||
/* Advertised Accept-Encoding; gzip and deflate both decode via hts_zunpack */
|
||||
const char *hts_acceptencoding(hts_boolean compressible) {
|
||||
return compressible ? "gzip, deflate, identity;q=0.9" : "identity";
|
||||
}
|
||||
|
||||
// get type of fil (php)
|
||||
// s: buffer (text/html) or NULL
|
||||
// return: 1 if known by user
|
||||
@@ -5797,6 +5811,13 @@ HTSEXT_API int hts_init(void) {
|
||||
abortLog("unable to initialize TLS: SSL_CTX_new()");
|
||||
assertf("unable to initialize TLS" == NULL);
|
||||
}
|
||||
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
|
||||
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
|
||||
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
|
||||
#else
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
|
||||
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -285,6 +285,9 @@ int ishttperror(int err);
|
||||
int get_userhttptype(httrackp * opt, char *s, const char *fil);
|
||||
int give_mimext(char *s, size_t ssize, const char *st);
|
||||
|
||||
/* Advertised Accept-Encoding value (no header name/CRLF); see htslib.c. */
|
||||
const char *hts_acceptencoding(hts_boolean compressible);
|
||||
|
||||
int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename);
|
||||
int may_unknown2(httrackp * opt, const char *mime, const char *filename);
|
||||
|
||||
|
||||
352
src/htsparse.c
352
src/htsparse.c
@@ -77,13 +77,14 @@ Please visit our Website: http://www.httrack.com
|
||||
/** Append to the output buffer the string 'A'. **/
|
||||
#define HT_ADD(A) TypedArrayAppend(output_buffer, A, strlen(A))
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION) do { \
|
||||
/* clang-format off: an edit realigns all backslashes, churning the macro. */
|
||||
/* clang-format off */
|
||||
/** Append 'A' to the output buffer, html-escaped; FACTOR = max byte expansion. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION, FACTOR) do { \
|
||||
if ((opt->getmode & 1) != 0 && ptr>0) { \
|
||||
const char *const str_ = (A); \
|
||||
size_t size_; \
|
||||
/* & is the maximum expansion */ \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * 5 + 1024); \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * (FACTOR) + 1024); \
|
||||
size_ = FUNCTION(str_, &TypedArrayTail(output_buffer), \
|
||||
TypedArrayRoom(output_buffer)); \
|
||||
TypedArraySize(output_buffer) += size_; \
|
||||
@@ -91,188 +92,113 @@ Please visit our Website: http://www.httrack.com
|
||||
} while(0)
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped for &. **/
|
||||
#define HT_ADD_HTMLESCAPED(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print)
|
||||
#define HT_ADD_HTMLESCAPED(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print, HTS_HTMLESCAPE_MAXEXP)
|
||||
|
||||
/**
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* high chars.
|
||||
**/
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full)
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full, HTS_HTMLESCAPE_FULL_MAXEXP)
|
||||
/* clang-format on */
|
||||
|
||||
// does nothing
|
||||
#define XH_uninit do {} while(0)
|
||||
|
||||
#define HT_ADD_END { \
|
||||
int ok=0;\
|
||||
if (TypedArraySize(output_buffer) != 0) { \
|
||||
const size_t ht_len = TypedArraySize(output_buffer); \
|
||||
const char *const ht_buff = TypedArrayElts(output_buffer); \
|
||||
char digest[32+2];\
|
||||
off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),savename()));\
|
||||
digest[0] = '\0';\
|
||||
domd5mem(TypedArrayElts(output_buffer), ht_len, digest, 1);\
|
||||
if (fsize_old == (off_t) ht_len) { \
|
||||
int mlen = 0;\
|
||||
char* mbuff;\
|
||||
cache_readdata(cache,"//[HTML-MD5]//",savename(),&mbuff,&mlen);\
|
||||
if (mlen) \
|
||||
mbuff[mlen]='\0';\
|
||||
if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
|
||||
ok=1;\
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s",savename());\
|
||||
} else {\
|
||||
ok=0;\
|
||||
} \
|
||||
}\
|
||||
if (!ok) { \
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 1, 1, r->notmodified); \
|
||||
fp=filecreate(&opt->state.strc, savename()); \
|
||||
if (fp) { \
|
||||
if (ht_len>0) {\
|
||||
if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
if (opt->log) { \
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to write HTML file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
fclose(fp); fp=NULL; \
|
||||
if (strnotempty(r->lastmodified)) \
|
||||
set_filetime_rfc822(savename(),r->lastmodified); \
|
||||
} else {\
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
hts_log_print(opt, LOG_ERROR, "Mirror aborted: disk full or filesystem problems"); \
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
} else {\
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 0, 0, r->notmodified); \
|
||||
filenote(&opt->state.strc, savename(),NULL); \
|
||||
}\
|
||||
if (cache->ndx)\
|
||||
cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename(),digest,(int)strlen(digest));\
|
||||
} \
|
||||
TypedArrayFree(output_buffer); \
|
||||
}
|
||||
#define HT_ADD_FOP
|
||||
|
||||
// COPY IN HTSCORE.C
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
|
||||
load and store lists can't drift apart. */
|
||||
/* clang-format off */
|
||||
#define ENGINE_MUTABLE_FIELDS(X) \
|
||||
X(int, error, stre->error_) \
|
||||
X(int, store_errpage, stre->store_errpage_) \
|
||||
X(int, makeindex_done, stre->makeindex_done_) \
|
||||
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
|
||||
X(int, makeindex_links, stre->makeindex_links_) \
|
||||
X(LLint, stat_fragment, stre->stat_fragment_)
|
||||
|
||||
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
|
||||
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
|
||||
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
htsblk* const r HTS_UNUSED = stre->r_; \
|
||||
hash_struct* const hash HTS_UNUSED = stre->hash_; \
|
||||
char* const codebase HTS_UNUSED = stre->codebase; \
|
||||
char* const base HTS_UNUSED = stre->base; \
|
||||
/* */ \
|
||||
const char * const template_header HTS_UNUSED = stre->template_header_; \
|
||||
const char * const template_body HTS_UNUSED = stre->template_body_; \
|
||||
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
|
||||
/* */ \
|
||||
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
|
||||
/* */ \
|
||||
/* */ \
|
||||
int error = * stre->error_; \
|
||||
int store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
int makeindex_done = *stre->makeindex_done_; \
|
||||
FILE* makeindex_fp = *stre->makeindex_fp_; \
|
||||
int makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
LLint stat_fragment = *stre->stat_fragment_; \
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
|
||||
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
|
||||
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
|
||||
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
|
||||
|
||||
#define ENGINE_SET_CONTEXT() \
|
||||
ENGINE_SET_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
error = * stre->error_; \
|
||||
store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
makeindex_done = *stre->makeindex_done_; \
|
||||
makeindex_fp = *stre->makeindex_fp_; \
|
||||
makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
stat_fragment = *stre->stat_fragment_; \
|
||||
makestat_time = stre->makestat_time; \
|
||||
makestat_fp = stre->makestat_fp
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
|
||||
|
||||
#define ENGINE_LOAD_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT()
|
||||
|
||||
#define ENGINE_SAVE_CONTEXT() \
|
||||
ENGINE_SAVE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
* stre->error_ = error; \
|
||||
* stre->store_errpage_ = store_errpage; \
|
||||
/* */ \
|
||||
*stre->makeindex_done_ = makeindex_done; \
|
||||
*stre->makeindex_fp_ = makeindex_fp; \
|
||||
*stre->makeindex_links_ = makeindex_links; \
|
||||
/* */ \
|
||||
*stre->stat_fragment_ = stat_fragment
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
|
||||
/* clang-format on */
|
||||
|
||||
#define _FILTERS (*opt->filters.filters)
|
||||
#define _FILTERS_PTR (opt->filters.filptr)
|
||||
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
|
||||
|
||||
/* Apply current *adr character for the script automate */
|
||||
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
|
||||
if (inscript) { \
|
||||
int new_state_pos; \
|
||||
new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*html]; \
|
||||
if (new_state_pos < 0) { \
|
||||
new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
|
||||
} \
|
||||
assertf(new_state_pos >= 0); \
|
||||
assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
|
||||
inscript_state_pos=new_state_pos; \
|
||||
} \
|
||||
} while(0)
|
||||
/* JS-detection automaton states; INSCRIPT_DEFAULT is the synthetic "any other
|
||||
char" column of the transition table. */
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
|
||||
/* Increment current pointer to 'steps' characters, modifying automate if necessary */
|
||||
#define INCREMENT_CURRENT_ADR(steps) do { \
|
||||
int steps__ = (int) ( steps ); \
|
||||
while(steps__ > 0) { \
|
||||
html++; \
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR(); \
|
||||
steps__ --; \
|
||||
} \
|
||||
} while(0)
|
||||
#define INSCRIPT_NSTATES 10 /* rows in the transition table */
|
||||
|
||||
/* Live view of the parser's automaton locals, set up once so the helpers below
|
||||
can drive it without capturing them by lexical scope. */
|
||||
typedef struct {
|
||||
const int *inscript; /* nonzero while inside a script body */
|
||||
const signed char (*table)[257]; /* [INSCRIPT_NSTATES][257] transitions */
|
||||
INSCRIPT *pos; /* current state */
|
||||
const char **html; /* parse cursor */
|
||||
} script_automate;
|
||||
|
||||
/* Feed the current *html byte to the automaton. No-op outside a script body. */
|
||||
static void hts_automate_lookup(const script_automate *aut) {
|
||||
if (*aut->inscript) {
|
||||
int next = aut->table[*aut->pos][(unsigned char) **aut->html];
|
||||
if (next < 0) {
|
||||
next = aut->table[*aut->pos][INSCRIPT_DEFAULT];
|
||||
}
|
||||
assertf(next >= 0 && next < INSCRIPT_NSTATES);
|
||||
*aut->pos = (INSCRIPT) next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
|
||||
static void hts_automate_increment(const script_automate *aut, int steps) {
|
||||
while (steps > 0) {
|
||||
(*aut->html)++;
|
||||
hts_automate_lookup(aut);
|
||||
steps--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Percent-encode the angle brackets of a string so it is safe to embed inside
|
||||
an HTML comment (the default footer) or any other HTML context. A URL holding
|
||||
@@ -417,20 +343,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int incomment = 0; // dans un <!--
|
||||
int inscript = 0; // dans un scipt pour applets javascript)
|
||||
int inscript_locked = 0; // in locked script (ie. js file)
|
||||
signed char inscript_state[10][257];
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
signed char inscript_state[INSCRIPT_NSTATES][257];
|
||||
INSCRIPT inscript_state_pos = INSCRIPT_START;
|
||||
const char *inscript_name = NULL; // script tag name
|
||||
int inscript_tag = 0; // on est dans un <body onLoad="... terminé par >
|
||||
@@ -491,6 +404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
inscript_state[INSCRIPT_COMMENT2]['*'] = INSCRIPT_COMMENT2;
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE; /* #8: escape in '' */
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE2; /* #9: escape in "" */
|
||||
const script_automate saut = {&inscript, inscript_state,
|
||||
&inscript_state_pos, &html};
|
||||
|
||||
/* Primary list or URLs */
|
||||
if (ptr == 0) {
|
||||
@@ -689,13 +604,14 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// Decode title with encoding
|
||||
if (str->page_charset_ != NULL
|
||||
&& *str->page_charset_ != '\0') {
|
||||
char *const sUtf =
|
||||
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
|
||||
if (str->page_charset_ != NULL &&
|
||||
*str->page_charset_ != '\0') {
|
||||
char *sUtf = hts_convertStringToUTF8(
|
||||
s, strlen(s), str->page_charset_);
|
||||
if (sUtf != NULL) {
|
||||
strcpy(s, sUtf);
|
||||
free(sUtf);
|
||||
/* UTF-8 can expand past s[]; truncate to fit */
|
||||
snprintf(s, sizeof(s), "%s", sUtf);
|
||||
freet(sUtf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -709,7 +625,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp,
|
||||
makeindex_links, makeindex_firstlink,
|
||||
template_footer, "primary", "primary");
|
||||
}
|
||||
} // if (opt->makeindex)
|
||||
}
|
||||
@@ -927,7 +845,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* automate */
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR();
|
||||
hts_automate_lookup(&saut);
|
||||
|
||||
// Note:
|
||||
// Certaines pages ne respectent pas le html
|
||||
@@ -1843,7 +1761,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// sauter espaces
|
||||
// adr+=p;
|
||||
INCREMENT_CURRENT_ADR(p);
|
||||
hts_automate_increment(&saut, p);
|
||||
while((is_space(*html)
|
||||
|| (inscriptgen && html[0] == '\\' && is_space(html[1])
|
||||
)
|
||||
@@ -1858,7 +1776,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// puis quitter
|
||||
// html++; // sauter les espaces, "" et cie
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
|
||||
/* Stop at \n (LF) if primary links or link lists */
|
||||
@@ -1873,7 +1791,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html == '\\') {
|
||||
if ((*(html + 1) == '\'') || (*(html + 1) == '"')) { // \" ou \'
|
||||
// html+=2; // sauter
|
||||
INCREMENT_CURRENT_ADR(2);
|
||||
hts_automate_increment(&saut, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1921,7 +1839,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (srcset_p) {
|
||||
while(html < r->adr + r->size
|
||||
&& (is_realspace(*html) || *html == ','))
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
eadr = html;
|
||||
|
||||
@@ -3381,7 +3299,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
assertf(eadr - html >= 0); // Should not go back
|
||||
if (eadr > html) {
|
||||
INCREMENT_CURRENT_ADR(eadr - 1 - html);
|
||||
hts_automate_increment(&saut, (int) (eadr - 1 - html));
|
||||
}
|
||||
// adr=eadr-1; // ** sauter
|
||||
|
||||
@@ -3400,7 +3318,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
q++; // skip whitespace and empty candidates
|
||||
if (q < endp && *q != '\0' && *q != ',' && *q != quote
|
||||
&& *q != '<' && *q != '>' && (unsigned char) *q >= 32) {
|
||||
INCREMENT_CURRENT_ADR(q - html); // keep the automate in sync
|
||||
hts_automate_increment(
|
||||
&saut, (int) (q - html)); // keep the automate in sync
|
||||
ok = 1;
|
||||
goto srcset_next;
|
||||
}
|
||||
@@ -3540,7 +3459,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* Flush and save to disk */
|
||||
HT_ADD_END; // achever
|
||||
if (TypedArraySize(output_buffer) != 0) {
|
||||
hts_finish_html_file(
|
||||
opt, cache, r, &fp, TypedArrayElts(output_buffer),
|
||||
TypedArraySize(output_buffer), urladr(), urlfil(), savename());
|
||||
}
|
||||
TypedArrayFree(output_buffer);
|
||||
}
|
||||
//
|
||||
//
|
||||
@@ -3565,6 +3489,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil) {
|
||||
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
|
||||
const int norm_query = opt->urlhack && !opt->no_query_dedup;
|
||||
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (strcasecmp(jump_identification_const(moved_adr),
|
||||
jump_identification_const(cur_adr)) != 0)
|
||||
return HTS_FALSE;
|
||||
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
|
||||
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
|
||||
return strcasecmp(n_fil, pn_fil) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3610,36 +3552,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3667,7 +3582,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3690,7 +3615,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3714,7 +3643,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user intercations: pause, add link, delete link..
|
||||
*/
|
||||
|
||||
167
src/htsrobots.c
167
src/htsrobots.c
@@ -44,28 +44,84 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
// -- robots --
|
||||
|
||||
/* RFC 9309 path-prefix match; '*' any run, '$' anchors end; linear. */
|
||||
static hts_boolean robots_pattern_match(const char *pattern, const char *path) {
|
||||
size_t patlen = strlen(pattern);
|
||||
hts_boolean anchored = HTS_FALSE;
|
||||
const char *p, *pend, *s;
|
||||
const char *star = NULL, *star_s = NULL;
|
||||
|
||||
if (patlen > 0 && pattern[patlen - 1] == '$') {
|
||||
anchored = HTS_TRUE;
|
||||
patlen--;
|
||||
}
|
||||
p = pattern;
|
||||
pend = pattern + patlen;
|
||||
s = path;
|
||||
while (*s != '\0') {
|
||||
if (p == pend) {
|
||||
if (!anchored)
|
||||
return HTS_TRUE; // prefix matched
|
||||
if (star != NULL) { // anchored: '*' must eat the rest
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
continue;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
if (*p == '*') {
|
||||
star = p++;
|
||||
star_s = s;
|
||||
} else if (*p == *s) {
|
||||
p++;
|
||||
s++;
|
||||
} else if (star != NULL) {
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
} else {
|
||||
return HTS_FALSE;
|
||||
}
|
||||
}
|
||||
while (p < pend && *p == '*')
|
||||
p++;
|
||||
return (p == pend) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
// fil="" : vérifier si règle déja enregistrée
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
while(robots) {
|
||||
if (strfield2(robots->adr, adr)) {
|
||||
if (fil[0]) {
|
||||
/* RFC 9309: longest pattern wins, Allow beats Disallow on ties. */
|
||||
int ptr = 0;
|
||||
char line[250];
|
||||
char line[HTS_ROBOTS_TOKEN_SIZE];
|
||||
size_t toklen = strlen(robots->token);
|
||||
size_t best_len = 0;
|
||||
hts_boolean matched = HTS_FALSE;
|
||||
hts_boolean best_allow = HTS_FALSE;
|
||||
|
||||
if (strnotempty(robots->token)) {
|
||||
do {
|
||||
ptr += binput(robots->token + ptr, line, 200);
|
||||
if (line[0] == '/') { // absolu
|
||||
if (strfield(fil, line)) { // commence avec ligne
|
||||
return -1; // interdit
|
||||
}
|
||||
} else { // relatif
|
||||
if (strstrcase(fil, line)) {
|
||||
return -1;
|
||||
while (ptr < (int) toklen) {
|
||||
ptr += binput(robots->token + ptr, line, sizeof(line) - 1);
|
||||
if (line[0] != 'A' && line[0] != 'D')
|
||||
continue;
|
||||
{
|
||||
const hts_boolean is_allow =
|
||||
(line[0] == 'A') ? HTS_TRUE : HTS_FALSE;
|
||||
const char *pat = line + 1;
|
||||
|
||||
if (robots_pattern_match(pat, fil)) {
|
||||
const size_t len = strlen(pat);
|
||||
|
||||
if (!matched || len > best_len || (len == best_len && is_allow)) {
|
||||
matched = HTS_TRUE;
|
||||
best_len = len;
|
||||
best_allow = is_allow;
|
||||
}
|
||||
}
|
||||
} while((strnotempty(line)) && (ptr < (int) strlen(robots->token)));
|
||||
}
|
||||
}
|
||||
if (matched && !best_allow)
|
||||
return -1; // forbidden
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@@ -74,6 +130,93 @@ int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Append "<marker><pattern>\n" to the bounded rule blob if it fits. */
|
||||
static void robots_blob_add(char *blob, size_t blobsize, char marker,
|
||||
const char *pat) {
|
||||
const size_t used = strlen(blob);
|
||||
const size_t need = strlen(pat) + 2; // marker + '\n'
|
||||
|
||||
if (need < blobsize - used) { // overflow-safe: used <= blobsize-1
|
||||
blob[used] = marker;
|
||||
blob[used + 1] = '\0';
|
||||
strlcatbuff(blob, pat, blobsize);
|
||||
strlcatbuff(blob, "\n", blobsize);
|
||||
}
|
||||
}
|
||||
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow) {
|
||||
size_t bptr = 0;
|
||||
int record = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK blob[HTS_ROBOTS_TOKEN_SIZE];
|
||||
|
||||
blob[0] = '\0';
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", body);
|
||||
#endif
|
||||
while (bptr < bodysize) {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(body + bptr, line, sizeof(line) - 2);
|
||||
comm = strchr(line, '#'); // strip comment
|
||||
if (comm != NULL)
|
||||
*comm = '\0';
|
||||
llen = (int) strlen(line); // strip trailing spaces
|
||||
while (llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a = line + 11;
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // generic group applies to us
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack") ||
|
||||
strfield(a, "webhttrack")) {
|
||||
blob[0] = '\0'; // explicit group: restart capture
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
record = 2; // locked to the httrack group
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
hts_boolean is_allow = strfield(line, "allow:");
|
||||
hts_boolean is_disallow = !is_allow && strfield(line, "disallow:");
|
||||
|
||||
if (is_allow || is_disallow) {
|
||||
char *a = line + (is_allow ? 6 : 9);
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (strnotempty(a)) {
|
||||
if (is_disallow && !keep_root_disallow && strcmp(a, "/") == 0) {
|
||||
// dropped: site-wide disallow ignored by option
|
||||
} else {
|
||||
robots_blob_add(blob, sizeof(blob), is_allow ? 'A' : 'D', a);
|
||||
if (is_disallow && info != NULL &&
|
||||
strlen(a) + 2 < infosize - strlen(info)) {
|
||||
if (strnotempty(info))
|
||||
strlcatbuff(info, ", ", infosize);
|
||||
strlcatbuff(info, a, infosize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (strnotempty(blob))
|
||||
checkrobots_set(robots, adr, blob);
|
||||
}
|
||||
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data) {
|
||||
if (((int) strlen(adr)) >= sizeof(robots->adr) - 2)
|
||||
return 0;
|
||||
|
||||
@@ -39,17 +39,27 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEF_FWSTRUCT_robots_wizard
|
||||
typedef struct robots_wizard robots_wizard;
|
||||
#endif
|
||||
|
||||
/* Per-host blob: one rule per line, first byte 'A'/'D' then path pattern. */
|
||||
#define HTS_ROBOTS_TOKEN_SIZE 4096
|
||||
|
||||
struct robots_wizard {
|
||||
char adr[128];
|
||||
char token[4096];
|
||||
char token[HTS_ROBOTS_TOKEN_SIZE];
|
||||
struct robots_wizard *next;
|
||||
};
|
||||
|
||||
/* Library internal definictions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
/* -1 if `fil` disallowed for `adr` (RFC 9309); empty: -1 if rules exist. */
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil);
|
||||
void checkrobots_free(robots_wizard * robots);
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data);
|
||||
/* Parse robots.txt `body` for `adr`, storing the HTTrack group's rules; `info`
|
||||
gets a disallow summary, `keep_root_disallow` FALSE drops "Disallow: /". */
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -45,11 +45,15 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htslib.h"
|
||||
#include "htsparse.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsdns_selftest.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htsmd5.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
#endif
|
||||
#include "coucal/coucal.h"
|
||||
|
||||
#include <ctype.h>
|
||||
@@ -521,6 +525,41 @@ static int string_safety_selftests(void) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* StringCatN/StringSetLength must eval SIZE once: (n_eval++, V) leaves
|
||||
n_eval == 2 on a double-eval macro. */
|
||||
{
|
||||
String s = STRING_EMPTY;
|
||||
int n_eval = 0;
|
||||
|
||||
StringCat(s, "hello");
|
||||
StringCatN(s, "world", (n_eval++, 3)); /* strlen>SIZE so the clamp runs */
|
||||
if (n_eval != 1 || strcmp(StringBuff(s), "hellowor") != 0) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_eval = 0;
|
||||
StringSetLength(s, (n_eval++, 5));
|
||||
if (n_eval != 1 || StringLength(s) != 5) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
StringFree(s);
|
||||
}
|
||||
|
||||
/* StringSubRW still reads/writes after dropping its duplicate definition. */
|
||||
{
|
||||
String s = STRING_EMPTY;
|
||||
|
||||
StringCat(s, "abc");
|
||||
StringSubRW(s, 1) = 'X';
|
||||
if (StringSub(s, 1) != 'X' || strcmp(StringBuff(s), "aXc") != 0) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
StringFree(s);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -669,7 +708,8 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
}
|
||||
s = strdupt(argv[0]);
|
||||
enc = argc >= 2 ? argv[1] : "UTF-8";
|
||||
if (s != NULL && hts_unescapeEntitiesWithCharset(s, s, strlen(s), enc) == 0) {
|
||||
if (s != NULL &&
|
||||
hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1, enc) == 0) {
|
||||
printf("%s\n", s);
|
||||
freet(s);
|
||||
} else {
|
||||
@@ -678,6 +718,23 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* The unescapers must reserve one byte for the trailing NUL: a 'max'-byte
|
||||
dest holding 'max' output chars pre-fix wrote dest[max] (1-byte OOB, caught
|
||||
by ASan). Both unescapeEntities and unescapeUrl share the guard. */
|
||||
static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
|
||||
char dest[4];
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(hts_unescapeEntities("abcd", dest, sizeof(dest)) == -1);
|
||||
assertf(hts_unescapeUrl("abcd", dest, sizeof(dest)) == -1);
|
||||
assertf(hts_unescapeEntities("abc", dest, sizeof(dest)) == 0);
|
||||
assertf(strcmp(dest, "abc") == 0);
|
||||
printf("unescape-bounds self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_hashtable(httrackp *opt, int argc, char **argv) {
|
||||
char *snum;
|
||||
unsigned long count = 0;
|
||||
@@ -1302,6 +1359,165 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
|
||||
* alias. */
|
||||
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
|
||||
/* scheme and userinfo collapse (the #159 case); a different path does not */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
|
||||
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
|
||||
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
|
||||
*/
|
||||
opt->urlhack = HTS_TRUE;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
|
||||
/* slash/query fold only when the dedup flag is on */
|
||||
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
|
||||
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
#undef SAME
|
||||
printf("redirect-samefile self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
char path[HTS_URLMAXSIZE];
|
||||
char buf[4096];
|
||||
FILE *fp;
|
||||
size_t n;
|
||||
int done;
|
||||
|
||||
assertf(argc >= 1);
|
||||
snprintf(path, sizeof(path), "%s/index.html", argv[0]);
|
||||
|
||||
/* single first link: footer + a refresh meta carrying the escaped URL */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 1, "http://example.com/a b", "%s%s", "",
|
||||
"");
|
||||
assertf(fp == NULL); /* the function closed and cleared it */
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") != NULL);
|
||||
assertf(strstr(buf, "example.com") != NULL);
|
||||
|
||||
/* no single link: footer only, no refresh meta */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 0, NULL, "%s%s", "", "");
|
||||
assertf(fp == NULL);
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") == NULL);
|
||||
|
||||
UNLINK(path);
|
||||
printf("makeindex self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Each inplace_escape_*() must equal escape_*() on a copy. */
|
||||
static int st_inplace_escape(httrackp *opt, int argc, char **argv) {
|
||||
/* >255 bytes forces the helper's malloct path, not the stack buffer */
|
||||
static char longstr[600];
|
||||
static const char *const samples[] = {
|
||||
"", "abc", "a b/c?d=e&f", "h\x8ello w\x94rld",
|
||||
"a%b\"c<d>", "/path to/file", longstr};
|
||||
static size_t (*const inplace[])(char *, size_t) = {
|
||||
inplace_escape_in_url, inplace_escape_spc_url, inplace_escape_uri_utf,
|
||||
inplace_escape_check_url, inplace_escape_uri};
|
||||
static size_t (*const plain[])(const char *, char *, size_t) = {
|
||||
escape_in_url, escape_spc_url, escape_uri_utf, escape_check_url,
|
||||
escape_uri};
|
||||
size_t i, f;
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
|
||||
memset(longstr, 'a', sizeof(longstr) - 1);
|
||||
for (f = 0; f < sizeof(inplace) / sizeof(inplace[0]); f++) {
|
||||
for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
|
||||
char ref[4096], work[4096];
|
||||
size_t rret, iret;
|
||||
rret = plain[f](samples[i], ref, sizeof(ref));
|
||||
strcpybuff(work, samples[i]);
|
||||
iret = inplace[f](work, sizeof(work));
|
||||
assertf(iret == rret);
|
||||
assertf(strcmp(work, ref) == 0);
|
||||
}
|
||||
}
|
||||
printf("inplace-escape self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Pin HTS_HTMLESCAPE*_MAXEXP to each escaper's true max byte expansion. */
|
||||
static int st_escape_room(httrackp *opt, int argc, char **argv) {
|
||||
/* N > 1023: where 6n outgrows the old 5n+1024 reservation */
|
||||
enum { N = 2000 };
|
||||
|
||||
char *src = malloct(N + 1);
|
||||
char *dst;
|
||||
size_t room, got;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
|
||||
/* _full worst case: a high byte expands to "&#xHH;" (6 bytes) */
|
||||
memset(src, 0xE9, N);
|
||||
src[N] = '\0';
|
||||
room = (size_t) N * HTS_HTMLESCAPE_FULL_MAXEXP + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print_full(src, dst, room);
|
||||
assertf(got == (size_t) N * HTS_HTMLESCAPE_FULL_MAXEXP);
|
||||
assertf(strlen(dst) == got);
|
||||
freet(dst);
|
||||
|
||||
/* one factor short overflows (returns size), truncating the page: the bug */
|
||||
room = (size_t) N * (HTS_HTMLESCAPE_FULL_MAXEXP - 1) + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print_full(src, dst, room);
|
||||
assertf(got == room);
|
||||
freet(dst);
|
||||
|
||||
/* plain escaper worst case: '&' -> "&" (5); high bytes stay verbatim */
|
||||
memset(src, '&', N);
|
||||
src[N] = '\0';
|
||||
room = (size_t) N * HTS_HTMLESCAPE_MAXEXP + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print(src, dst, room);
|
||||
assertf(got == (size_t) N * HTS_HTMLESCAPE_MAXEXP);
|
||||
assertf(strlen(dst) == got);
|
||||
freet(dst);
|
||||
|
||||
freet(src);
|
||||
printf("escape-room self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
@@ -1318,6 +1534,259 @@ static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* HTTP status code -> reason phrase, including the modern 429/451. */
|
||||
static int st_status(httrackp *opt, int argc, char **argv) {
|
||||
const char *s;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
s = infostatuscode_const(429);
|
||||
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
|
||||
s = infostatuscode_const(451);
|
||||
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
|
||||
/* A spot-check of a long-standing code, and an unknown one. */
|
||||
s = infostatuscode_const(404);
|
||||
assertf(s != NULL && strcmp(s, "Not Found") == 0);
|
||||
assertf(infostatuscode_const(799) == NULL);
|
||||
printf("status self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if HTS_USEZLIB
|
||||
/* Deflate src->path at windowBits (16+ gzip, + zlib, - raw); 0 on success. */
|
||||
static int ae_write_packed(const char *path, int windowBits,
|
||||
const unsigned char *src, size_t len) {
|
||||
unsigned char out[8192];
|
||||
z_stream strm;
|
||||
FILE *f;
|
||||
int zerr;
|
||||
|
||||
memset(&strm, 0, sizeof(strm));
|
||||
if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowBits, 8,
|
||||
Z_DEFAULT_STRATEGY) != Z_OK)
|
||||
return 1;
|
||||
if ((f = FOPEN(path, "wb")) == NULL) {
|
||||
deflateEnd(&strm);
|
||||
return 1;
|
||||
}
|
||||
strm.next_in = (Bytef *) src;
|
||||
strm.avail_in = (uInt) len;
|
||||
do {
|
||||
size_t n;
|
||||
|
||||
strm.next_out = out;
|
||||
strm.avail_out = sizeof(out);
|
||||
zerr = deflate(&strm, Z_FINISH);
|
||||
n = sizeof(out) - strm.avail_out;
|
||||
if (n > 0 && fwrite(out, 1, n, f) != n) {
|
||||
deflateEnd(&strm);
|
||||
fclose(f);
|
||||
return 1;
|
||||
}
|
||||
} while (zerr == Z_OK);
|
||||
deflateEnd(&strm);
|
||||
fclose(f);
|
||||
return (zerr == Z_STREAM_END) ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Forged raw deflate (08 1D) that misdetects as zlib; only fallback decodes */
|
||||
static int ae_write_collision(const char *path, const unsigned char *src,
|
||||
size_t len) {
|
||||
/* block-1 LEN low byte 0x1D: with 0x08, (0x081D)%31==0 */
|
||||
const size_t n1 = 29;
|
||||
size_t n2, p = 0;
|
||||
unsigned char *buf;
|
||||
FILE *f;
|
||||
int ok;
|
||||
|
||||
if (len < n1 || len - n1 > 0xFFFF)
|
||||
return 1;
|
||||
n2 = len - n1;
|
||||
buf = malloct(10 + len);
|
||||
if (buf == NULL)
|
||||
return 1;
|
||||
buf[p++] = 0x08; /* BFINAL=0, BTYPE=00, forged padding -> zlib CMF nibble */
|
||||
buf[p++] = (unsigned char) (n1 & 0xff);
|
||||
buf[p++] = (unsigned char) (n1 >> 8);
|
||||
buf[p++] = (unsigned char) (~n1 & 0xff);
|
||||
buf[p++] = (unsigned char) ((~n1 >> 8) & 0xff);
|
||||
memcpy(buf + p, src, n1);
|
||||
p += n1;
|
||||
buf[p++] = 0x01; /* BFINAL=1, BTYPE=00 */
|
||||
buf[p++] = (unsigned char) (n2 & 0xff);
|
||||
buf[p++] = (unsigned char) (n2 >> 8);
|
||||
buf[p++] = (unsigned char) (~n2 & 0xff);
|
||||
buf[p++] = (unsigned char) ((~n2 >> 8) & 0xff);
|
||||
memcpy(buf + p, src + n1, n2);
|
||||
p += n2;
|
||||
f = FOPEN(path, "wb");
|
||||
ok = (f != NULL && fwrite(buf, 1, p, f) == p);
|
||||
if (f != NULL)
|
||||
fclose(f);
|
||||
freet(buf);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Compare path's bytes to expect[0..len); 0 if equal. Streams (large files). */
|
||||
static int ae_check_decoded(const char *path, const unsigned char *expect,
|
||||
size_t len) {
|
||||
unsigned char buf[8192];
|
||||
FILE *f = FOPEN(path, "rb");
|
||||
size_t off = 0, n;
|
||||
|
||||
if (f == NULL)
|
||||
return 1;
|
||||
while ((n = fread(buf, 1, sizeof(buf), f)) > 0) {
|
||||
if (n > len - off || memcmp(buf, expect + off, n) != 0) {
|
||||
fclose(f);
|
||||
return 1;
|
||||
}
|
||||
off += n;
|
||||
}
|
||||
fclose(f);
|
||||
return (off == len) ? 0 : 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Accept-Encoding (#450): advertise gzip+deflate; both decode (hts_zunpack) */
|
||||
static int st_acceptencoding(httrackp *opt, int argc, char **argv) {
|
||||
const char *off = hts_acceptencoding(HTS_FALSE);
|
||||
const char *on = hts_acceptencoding(HTS_TRUE);
|
||||
|
||||
(void) opt;
|
||||
assertf(strcmp(off, "identity") == 0);
|
||||
assertf(strstr(on, "gzip") != NULL);
|
||||
assertf(strstr(on, "deflate") != NULL); /* fails on the old gzip-only list */
|
||||
#if HTS_USEZLIB
|
||||
if (argc >= 1) {
|
||||
static const int windowBits[] = {16 + MAX_WBITS, MAX_WBITS, -MAX_WBITS};
|
||||
const unsigned char small[] =
|
||||
"deflate round-trip: HTTrack decodes gzip and deflate alike. "
|
||||
"deflate round-trip: HTTrack decodes gzip and deflate alike.";
|
||||
const size_t slen = sizeof(small) - 1;
|
||||
/* 64 KiB of varied (LCG) bytes: forces the multi-fread loop */
|
||||
const size_t blen = 64 * 1024;
|
||||
unsigned char *body = malloct(blen);
|
||||
uint32_t x = 0x1234567u;
|
||||
char inpath[HTS_URLMAXSIZE], outpath[HTS_URLMAXSIZE];
|
||||
size_t i;
|
||||
|
||||
assertf(body != NULL);
|
||||
for (i = 0; i < blen; i++) {
|
||||
x = x * 1103515245u + 12345u;
|
||||
body[i] = (unsigned char) (x >> 16);
|
||||
}
|
||||
/* gzip, zlib (RFC1950) and raw deflate (RFC1951), both small and large. */
|
||||
for (i = 0; i < sizeof(windowBits) / sizeof(windowBits[0]); i++) {
|
||||
snprintf(inpath, sizeof(inpath), "%s/ae-in-%d.z", argv[0], windowBits[i]);
|
||||
snprintf(outpath, sizeof(outpath), "%s/ae-out-%d", argv[0],
|
||||
windowBits[i]);
|
||||
assertf(ae_write_packed(inpath, windowBits[i], small, slen) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == (int) slen);
|
||||
assertf(ae_check_decoded(outpath, small, slen) == 0);
|
||||
assertf(ae_write_packed(inpath, windowBits[i], body, blen) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == (int) blen);
|
||||
assertf(ae_check_decoded(outpath, body, blen) == 0);
|
||||
}
|
||||
/* Fallback teeth: raw deflate misdetected as zlib; -1 without the retry. */
|
||||
snprintf(inpath, sizeof(inpath), "%s/ae-collide.z", argv[0]);
|
||||
snprintf(outpath, sizeof(outpath), "%s/ae-collide.out", argv[0]);
|
||||
assertf(ae_write_collision(inpath, body, 64) == 0);
|
||||
assertf(hts_zunpack(inpath, outpath) == 64);
|
||||
assertf(ae_check_decoded(outpath, body, 64) == 0);
|
||||
freet(body);
|
||||
}
|
||||
#else
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#endif
|
||||
printf("acceptencoding self-test OK: %s\n", on);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Each call parses `txt` under a fresh host, then checkrobots() for `path`. */
|
||||
static int rb_decide(robots_wizard *r, const char *txt, const char *path) {
|
||||
static int n = 0;
|
||||
char host[64];
|
||||
|
||||
snprintf(host, sizeof(host), "h%d.example", n++);
|
||||
robots_parse(r, host, txt, strlen(txt), NULL, 0, HTS_TRUE);
|
||||
return checkrobots(r, host, path);
|
||||
}
|
||||
|
||||
static int st_robots(httrackp *opt, int argc, char **argv) {
|
||||
robots_wizard robots;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
memset(&robots, 0, sizeof(robots));
|
||||
|
||||
/* Longer Allow re-opens subtree under Disallow: / (old matcher couldn't). */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /\nAllow: /public/\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/public/x") == 0); /* allowed */
|
||||
assertf(rb_decide(&robots, txt, "/private") == -1); /* denied */
|
||||
assertf(rb_decide(&robots, txt, "/") == -1); /* denied */
|
||||
}
|
||||
|
||||
/* Equal-length match: Allow wins the tie over Disallow. */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /foo\nAllow: /foo\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/foo/bar") == 0);
|
||||
}
|
||||
|
||||
/* Longest match wins even when it is not the last rule. */
|
||||
{
|
||||
assertf(rb_decide(&robots, "User-agent: *\nDisallow: /a/b\nAllow: /a\n",
|
||||
"/a/b/c") == -1);
|
||||
assertf(rb_decide(&robots, "User-agent: *\nAllow: /a/b\nDisallow: /a\n",
|
||||
"/a/b/c") == 0);
|
||||
}
|
||||
|
||||
/* '*' matches any run of characters. */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /*.php\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/a/b/index.php") == -1);
|
||||
assertf(rb_decide(&robots, txt, "/a/b/index.html") == 0);
|
||||
}
|
||||
|
||||
/* Trailing '$' anchors the end of the path. */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /a$\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/a") == -1);
|
||||
assertf(rb_decide(&robots, txt, "/ab") == 0);
|
||||
assertf(rb_decide(&robots, txt, "/a/b") == 0);
|
||||
}
|
||||
|
||||
/* The httrack-specific group replaces the generic '*' group entirely. */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /everyone\n"
|
||||
"User-agent: httrack\nDisallow: /\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/anything") == -1);
|
||||
}
|
||||
|
||||
/* Replace, not merge: the generic group does not bind the httrack group. */
|
||||
{
|
||||
const char *txt = "User-agent: *\nDisallow: /x\n"
|
||||
"User-agent: httrack\nDisallow: /y\n";
|
||||
|
||||
assertf(rb_decide(&robots, txt, "/x") == 0);
|
||||
assertf(rb_decide(&robots, txt, "/y") == -1);
|
||||
}
|
||||
|
||||
/* No rules: everything is allowed. */
|
||||
assertf(rb_decide(&robots, "User-agent: *\nDisallow:\n", "/x") == 0);
|
||||
|
||||
checkrobots_free(&robots);
|
||||
printf("robots self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1338,6 +1807,8 @@ static const struct selftest_entry {
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
|
||||
st_redirect_samefile},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
@@ -1346,6 +1817,8 @@ static const struct selftest_entry {
|
||||
{"idna-decode", "<host>", "decode an IDNA/punycode hostname",
|
||||
st_idna_decode},
|
||||
{"entities", "<string> [encoding]", "unescape HTML entities", st_entities},
|
||||
{"unescape-bounds", "", "unescapers reserve the NUL byte (no 1-byte OOB)",
|
||||
st_unescape_bounds},
|
||||
{"hashtable", "<count|file>", "coucal hashtable stress test", st_hashtable},
|
||||
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
|
||||
st_strsafe},
|
||||
@@ -1365,6 +1838,17 @@ static const struct selftest_entry {
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"makeindex", "[dir]", "hts_finish_makeindex footer/refresh self-test",
|
||||
st_makeindex},
|
||||
{"inplace-escape", "", "inplace_escape_* vs escape_* equivalence self-test",
|
||||
st_inplace_escape},
|
||||
{"escape-room", "", "HT_ADD_HTMLESCAPED* reservation-factor self-test",
|
||||
st_escape_room},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
{"acceptencoding", "[dir]",
|
||||
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
|
||||
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
|
||||
st_robots},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
@@ -121,9 +121,6 @@ struct String {
|
||||
/** Byte at POS (read/write). No bounds check; POS must be < StringLength. **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Subcharacter (read/write) **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Byte POS positions from the end (read). POS==1 is the last byte. **/
|
||||
#define StringRight(BLK, POS) (StringBuff(BLK)[StringLength(BLK) - POS])
|
||||
|
||||
@@ -191,8 +188,9 @@ HTS_STATIC char *StringBuffN_(String *blk, int size) {
|
||||
asserts SIZE fits the existing content; does not (re)allocate. **/
|
||||
#define StringSetLength(BLK, SIZE) \
|
||||
do { \
|
||||
if (SIZE >= 0) { \
|
||||
(BLK).length_ = SIZE; \
|
||||
const int len__ = (SIZE); /* signed: negative means strlen(buffer_) */ \
|
||||
if (len__ >= 0) { \
|
||||
(BLK).length_ = len__; \
|
||||
} else { \
|
||||
(BLK).length_ = strlen((BLK).buffer_); \
|
||||
} \
|
||||
@@ -308,10 +306,11 @@ HTS_STATIC void StringAttach(String *blk, char **str) {
|
||||
#define StringCatN(BLK, STR, SIZE) \
|
||||
do { \
|
||||
const char *str__ = (STR); \
|
||||
const size_t usize__ = (SIZE); \
|
||||
if (str__ != NULL) { \
|
||||
size_t size__ = strlen(str__); \
|
||||
if (size__ > (SIZE)) { \
|
||||
size__ = (SIZE); \
|
||||
if (size__ > usize__) { \
|
||||
size__ = usize__; \
|
||||
} \
|
||||
StringMemcat(BLK, str__, size__); \
|
||||
} \
|
||||
|
||||
103
src/htszlib.c
103
src/htszlib.c
@@ -47,48 +47,89 @@ Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
|
||||
/*
|
||||
Unpack file into a new file
|
||||
Unpack file into a new file (gzip, zlib RFC1950 or raw deflate RFC1951).
|
||||
Return value: size of the new file, or -1 if an error occurred
|
||||
*/
|
||||
/* Note: utf-8 */
|
||||
int hts_zunpack(char *filename, char *newfile) {
|
||||
int ret = -1;
|
||||
|
||||
if (filename != NULL && newfile != NULL) {
|
||||
if (filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
const int fd = in != NULL ? fileno(in) : -1;
|
||||
const int dup_fd = fd != -1 ? dup(fd) : -1;
|
||||
// Note: we must dup to be able to flose cleanly.
|
||||
const gzFile gz = dup_fd != -1 ? gzdopen(dup_fd, "rb") : NULL;
|
||||
if (filename != NULL && newfile != NULL && filename[0] && newfile[0]) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
|
||||
|
||||
if (gz) {
|
||||
FILE *const fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
int size = 0;
|
||||
if (in != NULL) {
|
||||
unsigned char BIGSTK inbuf[8192];
|
||||
size_t navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
/* gzip/zlib headers -> +32 windowBits; else raw deflate (RFC1951) */
|
||||
const hts_boolean wrapped =
|
||||
(navail >= 2 &&
|
||||
((inbuf[0] == 0x1f && inbuf[1] == 0x8b) ||
|
||||
((inbuf[0] & 0x0f) == Z_DEFLATED &&
|
||||
(((unsigned) inbuf[0] << 8 | inbuf[1]) % 31) == 0)));
|
||||
int attempt;
|
||||
|
||||
if (fpout) {
|
||||
int nr;
|
||||
/* deflate is ambiguous; on failure retry with the other windowBits */
|
||||
for (attempt = 0; attempt < 2 && ret < 0; attempt++) {
|
||||
const int windowBits =
|
||||
(attempt == 0 ? wrapped : !wrapped) ? (32 + MAX_WBITS) : -MAX_WBITS;
|
||||
FILE *fpout;
|
||||
z_stream strm;
|
||||
|
||||
do {
|
||||
char BIGSTK buff[1024];
|
||||
|
||||
nr = gzread(gz, buff, sizeof(buff));
|
||||
if (nr > 0) {
|
||||
size += nr;
|
||||
if (fwrite(buff, 1, nr, fpout) != nr)
|
||||
nr = size = -1;
|
||||
}
|
||||
} while(nr > 0);
|
||||
if (attempt > 0) {
|
||||
/* rewind input; reopening fpout "wb" discards the partial output */
|
||||
if (fseek(in, 0, SEEK_SET) != 0)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
}
|
||||
fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
|
||||
if (fpout == NULL)
|
||||
break;
|
||||
memset(&strm, 0, sizeof(strm));
|
||||
if (inflateInit2(&strm, windowBits) != Z_OK) {
|
||||
fclose(fpout);
|
||||
} else
|
||||
size = -1;
|
||||
gzclose(gz);
|
||||
ret = (int) size;
|
||||
}
|
||||
if (in != NULL) {
|
||||
fclose(in);
|
||||
break;
|
||||
}
|
||||
{
|
||||
hts_boolean ok = HTS_TRUE;
|
||||
int size = 0;
|
||||
int zerr = Z_OK;
|
||||
|
||||
/* chunked inflate; first chunk in inbuf, single member */
|
||||
do {
|
||||
strm.next_in = inbuf;
|
||||
strm.avail_in = (uInt) navail;
|
||||
do {
|
||||
unsigned char BIGSTK outbuf[8192];
|
||||
size_t produced;
|
||||
|
||||
strm.next_out = outbuf;
|
||||
strm.avail_out = sizeof(outbuf);
|
||||
zerr = inflate(&strm, Z_NO_FLUSH);
|
||||
if (zerr == Z_NEED_DICT || zerr == Z_DATA_ERROR ||
|
||||
zerr == Z_MEM_ERROR || zerr == Z_STREAM_ERROR) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
produced = sizeof(outbuf) - strm.avail_out;
|
||||
if (produced > 0 &&
|
||||
fwrite(outbuf, 1, produced, fpout) != produced) {
|
||||
ok = HTS_FALSE;
|
||||
break;
|
||||
}
|
||||
size += (int) produced;
|
||||
} while (strm.avail_out == 0);
|
||||
if (!ok || zerr == Z_STREAM_END)
|
||||
break;
|
||||
navail = fread(inbuf, 1, sizeof(inbuf), in);
|
||||
} while (navail > 0);
|
||||
if (ok && zerr == Z_STREAM_END)
|
||||
ret = size;
|
||||
}
|
||||
inflateEnd(&strm);
|
||||
fclose(fpout);
|
||||
}
|
||||
fclose(in);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
@@ -497,6 +497,12 @@ static const char *GetHttpMessage(int statuscode) {
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
break;
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
break;
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
break;
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
break;
|
||||
|
||||
7
tests/01_engine-escape-room.test
Normal file
7
tests/01_engine-escape-room.test
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HT_ADD_HTMLESCAPED* must reserve the escaper's worst case (6 for _full).
|
||||
httrack -O /dev/null -#test=escape-room run | grep -q "escape-room self-test OK"
|
||||
7
tests/01_engine-inplace-escape.test
Executable file
7
tests/01_engine-inplace-escape.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# inplace_escape_*() must match escape_*() on a copy: guards the shared helper.
|
||||
httrack -O /dev/null -#test=inplace-escape run | grep -q "inplace-escape self-test OK"
|
||||
12
tests/01_engine-makeindex.test
Executable file
12
tests/01_engine-makeindex.test
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# hts_finish_makeindex writes the footer and gates the refresh meta on a single
|
||||
# first link (guards the macro->function extraction).
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
httrack -O /dev/null -#test=makeindex "$dir" run |
|
||||
grep -q "makeindex self-test OK"
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
|
||||
# followed through, not turned into a self-pointing "moved" stub. The decision
|
||||
# helper is exercised by the engine self-test.
|
||||
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"
|
||||
7
tests/01_engine-robots.test
Executable file
7
tests/01_engine-robots.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# robots.txt RFC 9309 Allow/Disallow precedence (#452): longest match wins.
|
||||
httrack -O /dev/null -#test=robots run | grep -q "robots self-test OK"
|
||||
7
tests/01_engine-status.test
Executable file
7
tests/01_engine-status.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HTTP status -> reason phrase, including the modern 429/451 (#453).
|
||||
httrack -O /dev/null -#test=status run | grep -q "status self-test OK"
|
||||
7
tests/01_engine-unescape-bounds.test
Executable file
7
tests/01_engine-unescape-bounds.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
|
||||
httrack -O /dev/null -#test=unescape-bounds run | grep -q "unescape-bounds self-test OK"
|
||||
11
tests/01_zlib-acceptencoding.test
Executable file
11
tests/01_zlib-acceptencoding.test
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Accept-Encoding (#450): advertise gzip+deflate; decode gzip/zlib/raw-deflate.
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
httrack -O /dev/null -#test=acceptencoding "$dir" run |
|
||||
grep -q "acceptencoding self-test OK"
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#test=cache-golden <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# 01_zlib-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
# writer and reader drift together. This reads a *committed* cache frozen by an
|
||||
# earlier build and asserts a fixed set of entries still decodes field- and
|
||||
@@ -20,6 +20,14 @@ if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "python3 missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
# The fixture needs a second loopback IP (dead 127.0.0.2 + live 127.0.0.1) for
|
||||
# the fallback to have a target; GNU/Hurd has only 127.0.0.1, so skip there.
|
||||
case "$(uname -s)" in
|
||||
GNU | GNU/*)
|
||||
echo "GNU/Hurd: single loopback IP, connect-fallback fixture unbuildable, skipping"
|
||||
exit 77
|
||||
;;
|
||||
esac
|
||||
|
||||
server="$top_srcdir/tests/local-server.py"
|
||||
root="$top_srcdir/tests/server-root"
|
||||
|
||||
13
tests/30_local-fragment-link.test
Executable file
13
tests/30_local-fragment-link.test
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
# Issue #279: an anchored link (target.html#sec, quoted or bare) fetches the
|
||||
# target with the fragment dropped (strict server 400s on a '#' in the request)
|
||||
# but keeps it in the rewritten local link so the anchor still works.
|
||||
set -e
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'fraglink/target.html' \
|
||||
--file-matches 'fraglink/index.html' 'href=target\.html#sec' \
|
||||
--file-matches 'fraglink/index.html' 'href="target\.html#sec2"' \
|
||||
httrack 'BASEURL/fraglink/index.html'
|
||||
@@ -1,4 +1,4 @@
|
||||
# Committed binary fixture read by 01_engine-cache-golden.test. List it
|
||||
# Committed binary fixture read by 01_zlib-cache-golden.test. List it
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
server-root/fraglink/index.html server-root/fraglink/target.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -25,9 +26,6 @@ TEST_EXTENSIONS = .test
|
||||
TEST_LOG_COMPILER = $(BASH)
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-cache-writefail.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
@@ -39,18 +37,29 @@ TESTS = \
|
||||
01_engine-filter.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-escape-room.test \
|
||||
01_engine-inplace-escape.test \
|
||||
01_engine-makeindex.test \
|
||||
01_engine-mime.test \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-unescape-bounds.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
@@ -77,6 +86,7 @@ TESTS = \
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -15,8 +15,11 @@
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
@@ -121,6 +124,10 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--file-matches | --file-not-matches)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
|
||||
pos=$((pos + 2))
|
||||
;;
|
||||
httrack)
|
||||
pos=$((pos + 1))
|
||||
break
|
||||
@@ -294,6 +301,24 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--file-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} matches ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then result "OK"; else
|
||||
result "no match"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
--file-not-matches)
|
||||
path="${audit[$((i + 1))]}"
|
||||
i=$((i + 2))
|
||||
info "checking ${path} lacks ${audit[$i]}"
|
||||
if grep -aqE "${audit[$i]}" "${hostroot}/${path}"; then
|
||||
result "matched"
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
4
tests/server-root/fraglink/index.html
Normal file
4
tests/server-root/fraglink/index.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html><body>
|
||||
<a href=target.html#sec>unquoted fragment link</a>
|
||||
<a href="target.html#sec2">quoted fragment link</a>
|
||||
</body></html>
|
||||
1
tests/server-root/fraglink/target.html
Normal file
1
tests/server-root/fraglink/target.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>
|
||||
Reference in New Issue
Block a user