mirror of
https://github.com/xroche/httrack.git
synced 2026-07-04 16:14:47 +03:00
Compare commits
18 Commits
worktree-a
...
naming-con
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92ad109c30 | ||
|
|
56b809c82d | ||
|
|
d7c4eab1f5 | ||
|
|
2eac19655b | ||
|
|
83c231d50e | ||
|
|
9d29b8329b | ||
|
|
ac4a1ca48e | ||
|
|
9f2f2e52fa | ||
|
|
92db2f2b41 | ||
|
|
ec52112446 | ||
|
|
1eaddc9c0e | ||
|
|
d97a7bdfd9 | ||
|
|
d2d02d87c2 | ||
|
|
4958bb8666 | ||
|
|
07da404cb8 | ||
|
|
694e45c698 | ||
|
|
db9ec2cc3b | ||
|
|
6a9ab2a11f |
10
configure.ac
10
configure.ac
@@ -63,6 +63,16 @@ AC_SUBST(LT_CV_OBJDIR,$lt_cv_objdir)
|
||||
# Export version info
|
||||
AC_SUBST(VERSION_INFO)
|
||||
|
||||
# Versioned plugin name for dlopen() in hts_create_opt(); soname major is
|
||||
# libtool's current - age, so this tracks VERSION_INFO bumps automatically.
|
||||
HTS_SONAME_MAJOR=$((${VERSION_INFO%%:*} - ${VERSION_INFO##*:}))
|
||||
case "$host_os" in
|
||||
darwin*) HTS_LIBHTSJAVA_NAME="libhtsjava.$HTS_SONAME_MAJOR.dylib" ;;
|
||||
*) HTS_LIBHTSJAVA_NAME="libhtsjava.so.$HTS_SONAME_MAJOR" ;;
|
||||
esac
|
||||
AC_DEFINE_UNQUOTED([HTS_LIBHTSJAVA_NAME], ["$HTS_LIBHTSJAVA_NAME"],
|
||||
[Versioned libhtsjava runtime name, derived from VERSION_INFO])
|
||||
|
||||
### Default CFLAGS
|
||||
DEFAULT_CFLAGS="-Wall -Wformat -Wformat-security \
|
||||
-Wmultichar -Wwrite-strings -Wcast-qual -Wcast-align \
|
||||
|
||||
3
debian/control
vendored
3
debian/control
vendored
@@ -1,9 +1,8 @@
|
||||
Source: httrack
|
||||
Section: web
|
||||
Priority: optional
|
||||
Maintainer: Xavier Roche <roche@httrack.com>
|
||||
Standards-Version: 4.7.4
|
||||
Build-Depends: debhelper-compat (= 13), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Build-Depends: debhelper-compat (= 14), autoconf, autoconf-archive, automake, libtool, zlib1g-dev, libssl-dev
|
||||
Rules-Requires-Root: no
|
||||
Homepage: http://www.httrack.com
|
||||
Vcs-Git: https://github.com/xroche/httrack.git
|
||||
|
||||
4
debian/source/lintian-overrides
vendored
4
debian/source/lintian-overrides
vendored
@@ -1,4 +1,6 @@
|
||||
httrack source: changelog-should-mention-nmu
|
||||
# Maintainer uploads sign the changelog as xavier@debian.org while the control
|
||||
# Maintainer is roche@httrack.com; lintian reads the address mismatch as an NMU.
|
||||
httrack source: no-nmu-in-changelog
|
||||
httrack source: source-nmu-has-incorrect-version-number
|
||||
|
||||
# The bundled HTML pages are the genuine upstream documentation taken from
|
||||
|
||||
6
debian/upstream/metadata
vendored
Normal file
6
debian/upstream/metadata
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
Repository: https://github.com/xroche/httrack.git
|
||||
Repository-Browse: https://github.com/xroche/httrack
|
||||
Bug-Database: https://github.com/xroche/httrack/issues
|
||||
Bug-Submit: https://github.com/xroche/httrack/issues/new
|
||||
Contact: Xavier Roche <roche@httrack.com>
|
||||
@@ -2237,12 +2237,13 @@ int host_wait(httrackp * opt, lien_back * back) {
|
||||
|
||||
static int slot_can_be_cleaned(const lien_back * back) {
|
||||
return (back->status == STATUS_READY) // ready
|
||||
/* Check autoclean */
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
/* Check autoclean */
|
||||
&& (!back->locked) // not held by hts_wait_delayed (name pending)
|
||||
&& (!back->testmode) // not test mode
|
||||
&& (strnotempty(back->url_sav)) // filename exists
|
||||
&& (HTTP_IS_OK(back->r.statuscode)) // HTTP "OK"
|
||||
&& (back->r.size >= 0) // size>=0
|
||||
;
|
||||
}
|
||||
|
||||
static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
|
||||
@@ -2891,10 +2892,10 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
// range size hack old location
|
||||
|
||||
#if HTS_DIRECTDISK
|
||||
// Court-circuit:
|
||||
// Peut-on stocker le fichier directement sur disque?
|
||||
// Ahh que ca serait vachement mieux et que ahh que la mémoire vous dit merci!
|
||||
if (back[i].status) {
|
||||
// Shortcut: store the file directly on disk when possible,
|
||||
// sparing memory
|
||||
if (back[i].status &&
|
||||
!back[i].locked) { // name still pending when locked
|
||||
if (back[i].r.is_write == 0) { // mode mémoire
|
||||
if (back[i].r.adr == NULL) { // rien n'a été écrit
|
||||
if (!back[i].testmode) { // pas mode test
|
||||
@@ -3960,8 +3961,12 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
&& (back[i].r.adr = (char *) malloct(2))) {
|
||||
back[i].r.adr[0] = 0;
|
||||
}
|
||||
hts_log_print(opt, LOG_TRACE, "finalizing empty");
|
||||
back_finalize(opt, cache, sback, i);
|
||||
/* locked = name pending; the waiter finalizes after
|
||||
patching url_sav (else: cached as .delayed, #5) */
|
||||
if (!back[i].locked) {
|
||||
hts_log_print(opt, LOG_TRACE, "finalizing empty");
|
||||
back_finalize(opt, cache, sback, i);
|
||||
}
|
||||
} else if (!back[i].r.is_chunk) { // pas de chunk
|
||||
//if (back[i].r.http11!=2) { // pas de chunk
|
||||
back[i].is_chunk = 0;
|
||||
|
||||
@@ -175,7 +175,9 @@ HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
//
|
||||
socinput(soc, line, 1000);
|
||||
if (strnotempty(line)) {
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound the caller buffers: method[32], url[HTS_URLMAXSIZE*2],
|
||||
protocol[256] */
|
||||
if (sscanf(line, "%31s %2047s %255s", method, url, protocol) == 3) {
|
||||
lien_adrfil af;
|
||||
|
||||
// méthode en majuscule
|
||||
|
||||
124
src/htscore.c
124
src/htscore.c
@@ -406,29 +406,106 @@ void hts_invalidate_link(httrackp * opt, int lpos) {
|
||||
opt->liens[lpos]->pass2 = -1;
|
||||
}
|
||||
|
||||
// Write the makeindex footer (refresh meta when makeindex_links==1), close
|
||||
// the file, then run usercommand.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil) {
|
||||
if (!*makeindex_done) {
|
||||
if (*makeindex_fp) {
|
||||
char BIGSTK tempo[1024];
|
||||
if (makeindex_links == 1) {
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE * 2];
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped));
|
||||
snprintf(tempo, sizeof(tempo),
|
||||
"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">" CRLF,
|
||||
link_escaped);
|
||||
} else
|
||||
tempo[0] = '\0';
|
||||
hts_template_format(*makeindex_fp, template_footer,
|
||||
"<!-- Mirror and index made by HTTrack Website "
|
||||
"Copier/" HTTRACK_VERSION " " HTTRACK_AFF_AUTHORS
|
||||
" -->",
|
||||
tempo, /* EOF */ NULL);
|
||||
fflush(*makeindex_fp);
|
||||
fclose(*makeindex_fp);
|
||||
*makeindex_fp = NULL;
|
||||
usercommand(opt, 0, NULL,
|
||||
fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_html_utf8), "index.html"),
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
*makeindex_done = 1;
|
||||
}
|
||||
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Flush the parsed HTML output buffer to disk, skipping the rewrite when the
|
||||
* on-disk MD5 is unchanged. */
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save) {
|
||||
char digest[32 + 2];
|
||||
off_t fsize_old =
|
||||
fsize(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), save));
|
||||
int ok = 0;
|
||||
|
||||
digest[0] = '\0';
|
||||
domd5mem(ht_buff, ht_len, digest, 1);
|
||||
if (fsize_old == (off_t) ht_len) {
|
||||
int mlen = 0;
|
||||
char *mbuff;
|
||||
|
||||
cache_readdata(cache, "//[HTML-MD5]//", save, &mbuff, &mlen);
|
||||
if (mlen)
|
||||
mbuff[mlen] = '\0';
|
||||
if ((mlen == 32) && (strcmp(((mbuff != NULL) ? mbuff : ""), digest) == 0)) {
|
||||
ok = 1;
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s", save);
|
||||
}
|
||||
freet(mbuff);
|
||||
}
|
||||
if (!ok) {
|
||||
file_notify(opt, adr, fil, save, 1, 1, r->notmodified);
|
||||
*fp = filecreate(&opt->state.strc, save);
|
||||
if (*fp) {
|
||||
if (ht_len > 0 && fwrite(ht_buff, 1, ht_len, *fp) != ht_len) {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck)
|
||||
opt->state.exit_xh = -1;
|
||||
if (opt->log) {
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO,
|
||||
"Unable to write HTML file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
}
|
||||
fclose(*fp);
|
||||
*fp = NULL;
|
||||
if (strnotempty(r->lastmodified))
|
||||
set_filetime_rfc822(save, r->lastmodified);
|
||||
} else {
|
||||
int fcheck = check_fatal_io_errno();
|
||||
|
||||
if (fcheck) {
|
||||
hts_log_print(opt, LOG_ERROR,
|
||||
"Mirror aborted: disk full or filesystem problems");
|
||||
opt->state.exit_xh = -1;
|
||||
}
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", save);
|
||||
if (fcheck)
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");
|
||||
}
|
||||
} else {
|
||||
file_notify(opt, adr, fil, save, 0, 0, r->notmodified);
|
||||
filenote(&opt->state.strc, save, NULL);
|
||||
}
|
||||
if (cache->ndx)
|
||||
cache_writedata(cache->ndx, cache->dat, "//[HTML-MD5]//", save, digest,
|
||||
(int) strlen(digest));
|
||||
}
|
||||
|
||||
/* does it look like XML ? (SVG et al.) */
|
||||
static int look_like_xml(const char *s) {
|
||||
@@ -2044,7 +2121,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/*
|
||||
Ensure the index is being closed
|
||||
*/
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp, makeindex_links,
|
||||
makeindex_firstlink, template_footer, "", "");
|
||||
|
||||
/*
|
||||
updating-a-remotely-deleted-website hack
|
||||
|
||||
@@ -362,6 +362,20 @@ void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
// Finish the makeindex index.html (footer + refresh meta), run usercommand.
|
||||
// Updates *makeindex_done/*makeindex_fp in place; adr/fil are the mode strings.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil);
|
||||
|
||||
// Flush ht_buff[0..ht_len] to save on disk (skip if MD5 unchanged); *fp
|
||||
// closed+NULLed on write. Precondition: ht_len>0.
|
||||
void hts_finish_html_file(httrackp *opt, cache_back *cache, htsblk *r,
|
||||
FILE **fp, const char *ht_buff, size_t ht_len,
|
||||
const char *adr, const char *fil, const char *save);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
@@ -470,4 +484,8 @@ void voidf(void);
|
||||
/* HTML marker comment marking where the top index is spliced. */
|
||||
#define HTS_TOPINDEX "TOP_INDEX_HTTRACK"
|
||||
|
||||
/* Worst-case byte expansion HT_ADD_HTMLESCAPED* must reserve per escaper. */
|
||||
#define HTS_HTMLESCAPE_MAXEXP 5 /* escape_for_html_print: '&'->"&amp;" */
|
||||
#define HTS_HTMLESCAPE_FULL_MAXEXP 6 /* _full: high byte->"&#xHH;" */
|
||||
|
||||
#endif
|
||||
|
||||
@@ -69,11 +69,15 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
#endif
|
||||
|
||||
/* Marks a symbol an external wrapper module exports back to the engine
|
||||
(dllexport on Windows, nothing elsewhere). */
|
||||
/* Marks a symbol an external wrapper module exports back to the engine.
|
||||
Must override -fvisibility=hidden on ELF, or dlopen()ed plugins (htsjava)
|
||||
hide their own hts_plug()/hts_unplug() entry points. */
|
||||
#ifndef EXTERNAL_FUNCTION
|
||||
#ifdef _WIN32
|
||||
#define EXTERNAL_FUNCTION __declspec(dllexport)
|
||||
#elif ((defined(__GNUC__) && (__GNUC__ >= 4)) || \
|
||||
(defined(HAVE_VISIBILITY) && HAVE_VISIBILITY))
|
||||
#define EXTERNAL_FUNCTION __attribute__((visibility("default")))
|
||||
#else
|
||||
#define EXTERNAL_FUNCTION
|
||||
#endif
|
||||
|
||||
@@ -190,9 +190,9 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* copy */
|
||||
if (j + 1 > max) {
|
||||
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
/* overflow */
|
||||
return -1;
|
||||
}
|
||||
@@ -300,6 +300,11 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
|
||||
/* Was the character read successfully ? */
|
||||
if (nRead == utfBufferSize) {
|
||||
/* the 'continue' below skips the NUL-reserve guard: re-check */
|
||||
if (utfBufferJ + utfBufferSize >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Rollback write position to sequence start write position */
|
||||
j = utfBufferJ;
|
||||
|
||||
@@ -314,8 +319,8 @@ int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for overflow */
|
||||
if (j + 1 > max) {
|
||||
/* reserve one byte for the trailing NUL written after the loop */
|
||||
if (j + 1 >= max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
50
src/htsftp.c
50
src/htsftp.c
@@ -128,6 +128,33 @@ void launch_ftp(FTPDownloadStruct * params) {
|
||||
return 0; \
|
||||
}
|
||||
|
||||
/* Bounded split of a hostile-URL "user[:pass]@" prefix (see htsftp.h).
 *
 * src       start of the credentials.
 * end       the jump_identification() result; assumed to point just past the
 *           '@' that closes the credentials (the password scan has always
 *           excluded the byte at end[-1]).
 * user      receives the text before the first ':' (or before the closing
 *           '@' when there is no ':'), truncated to user_size - 1 bytes.
 * pass      receives the text between that ':' and the closing '@',
 *           truncated to pass_size - 1 bytes; "" when no password is given.
 *
 * Both outputs are always NUL-terminated. Both sizes must be nonzero.
 *
 * The user scan is bounded by 'end' exactly like the password scan, so a
 * login without a password ("bob@host:21/x", "bob@host/x") yields "bob"
 * rather than running on into the host part. */
void ftp_split_userpass(const char *src, const char *end, char *user,
                        size_t user_size, char *pass, size_t pass_size) {
  size_t n = 0;

  assertf(user_size > 0 && pass_size > 0); /* the size-1 math underflows on 0 */

  /* user: stop at ':' or NUL, and never reach the closing '@' at end[-1] */
  while (&src[n + 1] < end && src[n] != '\0' && src[n] != ':') {
    if (n < user_size - 1)
      user[n] = src[n];
    n++; /* keep counting past the buffer so the ':' is still located */
  }
  user[n < user_size ? n : user_size - 1] = '\0';

  pass[0] = '\0';
  if (src[n] == ':') { /* password follows the colon */
    const size_t base = n + 1;
    size_t k = 0;

    while (&src[base + k + 1] < end && src[base + k] != '\0') {
      if (k < pass_size - 1)
        pass[k] = src[base + k];
      k++;
    }
    pass[k < pass_size ? k : pass_size - 1] = '\0';
  }
}
|
||||
|
||||
// la véritable fonction une fois lancées les routines thread/fork
|
||||
int run_launch_ftp(FTPDownloadStruct * pStruct) {
|
||||
lien_back *back = pStruct->pBack;
|
||||
@@ -173,24 +200,7 @@ int run_launch_ftp(FTPDownloadStruct * pStruct) {
|
||||
while(*real_adr == '/')
|
||||
real_adr++; // sauter /
|
||||
if ((adr = jump_identification(real_adr)) != real_adr) { // user
|
||||
int i = -1;
|
||||
|
||||
pass[0] = '\0';
|
||||
do {
|
||||
i++;
|
||||
user[i] = real_adr[i];
|
||||
} while((real_adr[i] != ':') && (real_adr[i]));
|
||||
user[i] = '\0';
|
||||
if (real_adr[i] == ':') { // pass
|
||||
int j = -1;
|
||||
|
||||
i++; // oui on saute aussi le :
|
||||
do {
|
||||
j++;
|
||||
pass[j] = real_adr[i + j];
|
||||
} while(((&real_adr[i + j + 1]) < adr) && (real_adr[i + j]));
|
||||
pass[j] = '\0';
|
||||
}
|
||||
ftp_split_userpass(real_adr, adr, user, sizeof(user), pass, sizeof(pass));
|
||||
}
|
||||
// Calculer RETR <nom>
|
||||
{
|
||||
@@ -984,8 +994,8 @@ int get_ftp_line(T_SOC soc, char *ptrline, size_t line_size, int timeout) {
|
||||
//case 0: break; // pas encore --> erreur (on attend)!
|
||||
case 1:
|
||||
HTS_STAT.HTS_TOTAL_RECV += 1; // compter flux entrant
|
||||
if ((b != 10) && (b != 13))
|
||||
data[i++] = b;
|
||||
if ((b != 10) && (b != 13) && (i < (int) sizeof(data) - 1))
|
||||
data[i++] = b; // truncate hostile over-long reply lines
|
||||
break;
|
||||
default:
|
||||
if (ptrline)
|
||||
|
||||
@@ -70,6 +70,11 @@ int back_launch_ftp(FTPDownloadStruct * params);
|
||||
int run_launch_ftp(FTPDownloadStruct * params);
|
||||
int send_line(T_SOC soc, const char *data);
|
||||
int get_ftp_line(T_SOC soc, char *line, size_t line_size, int timeout);
|
||||
/* Split a "user[:pass]@" prefix (end = jump_identification result) into
|
||||
bounded, NUL-terminated user/pass buffers, truncating to fit.
|
||||
Both sizes must be nonzero. */
|
||||
void ftp_split_userpass(const char *src, const char *end, char *user,
|
||||
size_t user_size, char *pass, size_t pass_size);
|
||||
T_SOC get_datasocket(char *to_send, size_t to_send_size);
|
||||
int stop_ftp(lien_back * back);
|
||||
char *linejmp(char *line);
|
||||
|
||||
@@ -63,6 +63,9 @@ Please visit our Website: http://www.httrack.com
|
||||
/* This file */
|
||||
#include "htsjava.h"
|
||||
|
||||
/* calloct/freet wrappers */
|
||||
#include "htssafe.h"
|
||||
|
||||
static int reverse_endian(void) {
|
||||
int endian = 1;
|
||||
|
||||
@@ -204,7 +207,16 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
return 0;
|
||||
}
|
||||
|
||||
tab = (RESP_STRUCT *) calloc(header.count, sizeof(RESP_STRUCT));
|
||||
/* A constant-pool entry is >= 1 byte on disk; reject a count exceeding
|
||||
the file size (hostile .class ~68 MB alloc DoS). */
|
||||
if (!hts_count_fits(header.count, (LLint) fsize(file))) {
|
||||
fclose(fpout);
|
||||
sprintf(str->err_msg,
|
||||
"Invalid constant pool count %u (file len " LLintP ")",
|
||||
(unsigned) header.count, (LLint) fsize(file));
|
||||
return 0;
|
||||
}
|
||||
tab = (RESP_STRUCT *) calloct(header.count, sizeof(RESP_STRUCT));
|
||||
if (!tab) {
|
||||
sprintf(str->err_msg, "Unable to alloc %d bytes",
|
||||
(int) sizeof(RESP_STRUCT));
|
||||
@@ -230,7 +242,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
} else { // ++ une erreur est survenue!
|
||||
if (strnotempty(str->err_msg) == 0)
|
||||
strcpy(str->err_msg, "Internal readtable error");
|
||||
free(tab);
|
||||
freet(tab);
|
||||
if (fpout) {
|
||||
fclose(fpout);
|
||||
fpout = NULL;
|
||||
@@ -288,7 +300,7 @@ static int hts_parse_java(t_hts_callbackarg * carg, httrackp * opt,
|
||||
#if JAVADEBUG
|
||||
printf("end\n");
|
||||
#endif
|
||||
free(tab);
|
||||
freet(tab);
|
||||
if (fpout) {
|
||||
fclose(fpout);
|
||||
fpout = NULL;
|
||||
|
||||
@@ -33,15 +33,19 @@ Please visit our Website: http://www.httrack.com
|
||||
#ifndef HTSJAVA_DEFH
|
||||
#define HTSJAVA_DEFH
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_JAVA_HEADER
|
||||
#define HTS_DEF_FWSTRUCT_JAVA_HEADER
|
||||
typedef struct JAVA_HEADER JAVA_HEADER;
|
||||
#endif
|
||||
/* 10-byte on-disk .class header image, fread() directly: fields need exact
|
||||
widths (LP64's 8-byte 'unsigned long' magic never matched 0xCAFEBABE). */
|
||||
struct JAVA_HEADER {
|
||||
unsigned long int magic;
|
||||
unsigned short int minor;
|
||||
unsigned short int major;
|
||||
unsigned short int count;
|
||||
uint32_t magic;
|
||||
uint16_t minor;
|
||||
uint16_t major;
|
||||
uint16_t count;
|
||||
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_RESP_STRUCT
|
||||
|
||||
53
src/htslib.c
53
src/htslib.c
@@ -1149,7 +1149,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
char BIGSTK protocol[256], url[HTS_URLMAXSIZE * 2], method[256];
|
||||
|
||||
linput(fp, line, 1000);
|
||||
if (sscanf(line, "%s %s %s", method, url, protocol) == 3) {
|
||||
/* widths bound method[256], url[HTS_URLMAXSIZE*2], protocol[256] */
|
||||
if (sscanf(line, "%255s %2047s %255s", method, url, protocol) == 3) {
|
||||
size_t ret;
|
||||
// selon que l'on a ou pas un proxy
|
||||
if (retour->req.proxy.active) {
|
||||
@@ -4131,25 +4132,33 @@ DECLARE_APPEND_ESCAPE_VERSION(escape_uri)
|
||||
|
||||
#undef DECLARE_APPEND_ESCAPE_VERSION
|
||||
|
||||
// Same as above, but in-place
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_ ##NAME(char *const dest, const size_t size) { \
|
||||
char buffer[256]; \
|
||||
const size_t len = strnlen(dest, size); \
|
||||
const int in_buffer = len + 1 < sizeof(buffer); \
|
||||
char *src = in_buffer ? buffer : malloct(len + 1); \
|
||||
size_t ret; \
|
||||
assertf(src != NULL); \
|
||||
assertf(len < size); \
|
||||
memcpy(src, dest, len + 1); \
|
||||
ret = NAME(src, dest, size); \
|
||||
if (!in_buffer) { \
|
||||
freet(src); \
|
||||
} \
|
||||
return ret; \
|
||||
// Common shape of the escapers that inplace_escape() can drive.
typedef size_t (*escape_fn_t)(const char *src, char *dest, size_t size);

/* Escape the string held in dest, in place.
 *
 * The current content (located with strnlen(dest, size)) is copied aside,
 * to a 256-byte stack buffer when it fits with its terminator and to a
 * malloct() block otherwise. 'escape' is then run from that copy back into
 * dest with capacity 'size'. Returns whatever 'escape' returns.
 *
 * Asserts that the copy could be obtained and that dest is NUL-terminated
 * within 'size' bytes. */
static size_t inplace_escape(char *const dest, const size_t size,
                             escape_fn_t escape) {
  char stack_copy[256];
  const size_t length = strnlen(dest, size);
  const int use_stack = length + 1 < sizeof(stack_copy);
  char *snapshot;
  size_t result;

  if (use_stack) {
    snapshot = stack_copy;
  } else {
    snapshot = malloct(length + 1);
  }
  assertf(snapshot != NULL);
  assertf(length < size);

  /* length + 1: take the terminator along */
  memcpy(snapshot, dest, length + 1);
  result = escape(snapshot, dest, size);

  if (!use_stack) {
    freet(snapshot);
  }
  return result;
}
|
||||
|
||||
// Thin exported wrappers binding inplace_escape() to each escaper (ABI).
|
||||
#undef DECLARE_INPLACE_ESCAPE_VERSION
|
||||
#define DECLARE_INPLACE_ESCAPE_VERSION(NAME) \
|
||||
HTSEXT_API size_t inplace_##NAME(char *const dest, const size_t size) { \
|
||||
return inplace_escape(dest, size, NAME); \
|
||||
}
|
||||
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_in_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_spc_url)
|
||||
DECLARE_INPLACE_ESCAPE_VERSION(escape_uri_utf)
|
||||
@@ -6014,9 +6023,11 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
"htsswf", "htsjava", "httrack-plugin", NULL
|
||||
};
|
||||
#else
|
||||
static const char *defaultModules[] = {
|
||||
"libhtsswf.so.1", "libhtsjava.so.2", "httrack-plugin", NULL
|
||||
};
|
||||
#ifndef HTS_LIBHTSJAVA_NAME
|
||||
#define HTS_LIBHTSJAVA_NAME "libhtsjava.so" /* non-autoconf fallback */
|
||||
#endif
|
||||
static const char *defaultModules[] = {"libhtsswf.so.1", HTS_LIBHTSJAVA_NAME,
|
||||
"httrack-plugin", NULL};
|
||||
#endif
|
||||
httrackp *opt = malloc(sizeof(httrackp));
|
||||
|
||||
|
||||
159
src/htsname.c
159
src/htsname.c
@@ -138,37 +138,66 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Should the wire Content-Type override the URL's own extension when naming the
|
||||
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||
extension implies no specific type or the server declared a disagreeing one.
|
||||
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
|
||||
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
|
||||
is named .html. The sentinel rides the cache, so updates stay consistent. */
|
||||
/* Wire Content-Type vs URL extension: a patchable wire type wins over an
|
||||
unspecific ext, the HTS_UNKNOWN_MIME sentinel keeps a specific non-HTML ext
|
||||
(#267 guard), a declared disagreement is CONTESTED. Sentinel and verdict
|
||||
ride the cache, so updates stay consistent. */
|
||||
typedef enum wire_verdict {
|
||||
WIRE_KEEPS_EXT,
|
||||
WIRE_WINS,
|
||||
WIRE_CONTESTED
|
||||
} wire_verdict;
|
||||
|
||||
static wire_verdict wire_ext_verdict(httrackp *opt, const char *wiremime,
|
||||
const char *file, char *urlmime,
|
||||
size_t urlmime_size) {
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
return WIRE_KEEPS_EXT; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||
urlmime[0] = '\0';
|
||||
/* type implied by the URL extension, only when confidently known (flag 0) */
|
||||
if (!get_httptype_sized(opt, urlmime, urlmime_size, file, 0))
|
||||
return WIRE_WINS; /* URL ext implies no known type */
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return WIRE_KEEPS_EXT; /* agreement (no .htm->.html churn) */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
strfield2(wiremime, HTS_UNKNOWN_MIME))
|
||||
return WIRE_KEEPS_EXT; /* no declared type */
|
||||
return WIRE_CONTESTED;
|
||||
}
|
||||
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
char urlmime[256];
|
||||
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
return 0; /* type kept verbatim (keep-list / bogus-multiple) */
|
||||
urlmime[0] = '\0';
|
||||
/* type implied by the URL extension, only when confidently known (flag 0) */
|
||||
if (!get_httptype_sized(opt, urlmime, sizeof(urlmime), file, 0))
|
||||
return 1; /* URL ext implies no known type: trust the wire type */
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
|
||||
the server declared no type (the sentinel); an explicitly declared type,
|
||||
even text/html, is trusted, so a binary-looking URL that really serves
|
||||
HTML (login/error interstitial, soft-404) is named .html. */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
strfield2(wiremime, HTS_UNKNOWN_MIME))
|
||||
switch (wire_ext_verdict(opt, wiremime, file, urlmime, sizeof(urlmime))) {
|
||||
case WIRE_KEEPS_EXT:
|
||||
return 0;
|
||||
case WIRE_WINS:
|
||||
return 1;
|
||||
case WIRE_CONTESTED:
|
||||
break; /* no content evidence is consulted today: trust the wire */
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// forme le nom du fichier à sauver (save) à partir de fil et adr
|
||||
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
|
||||
/* Wire-metadata name change: a Content-Disposition filename wins (returns 2),
|
||||
else the declared type's ext when wire_patches_ext() allows (returns 1),
|
||||
else 0. ext receives the new extension or replacement filename. */
|
||||
static int resolve_extension(httrackp *opt, const char *cdispo,
|
||||
const char *contenttype, const char *fil,
|
||||
char *ext, size_t ext_size) {
|
||||
if (strnotempty(cdispo)) {
|
||||
strlcpybuff(ext, cdispo, ext_size);
|
||||
return 2;
|
||||
}
|
||||
if (wire_patches_ext(opt, contenttype, fil) &&
|
||||
give_mimext(ext, ext_size, contenttype))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Build the local save name (save) from adr/fil; renames on collision
|
||||
// (e.g. INDEX.HTML vs index.html).
|
||||
int url_savename(lien_adrfilsave *const afs,
|
||||
lien_adrfil *const former,
|
||||
const char *referer_adr, const char *referer_fil,
|
||||
@@ -405,45 +434,23 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
|
||||
// si option check_type activée
|
||||
if (is_html < 0 && opt->check_type && !ext_chg) {
|
||||
int ishtest = 0;
|
||||
|
||||
if (protocol != PROTOCOL_FILE
|
||||
&& protocol != PROTOCOL_FTP
|
||||
) {
|
||||
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
||||
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||
(ishtest = ishtml(opt, fil)) <
|
||||
0) { // unsure whether it's html or a file
|
||||
ishtml(opt, fil) < 0) { // unsure whether it's html or a file
|
||||
// lire dans le cache
|
||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||
|
||||
if (r.statuscode != -1) { // pas d'erreur de lecture cache
|
||||
char s[32];
|
||||
|
||||
s[0] = '\0';
|
||||
if (r.statuscode != -1) { // cache entry read OK
|
||||
hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
|
||||
adr_complete, fil_complete);
|
||||
if (!HTTP_IS_REDIRECT(r.statuscode)) {
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
ext_chg = resolve_extension(opt, r.cdispo, r.contenttype, fil,
|
||||
ext, sizeof(ext));
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtest == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
//
|
||||
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
Lookup mimetype not only by extension,
|
||||
@@ -467,22 +474,11 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// fail later
|
||||
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||
!opt->state.stop) {
|
||||
// Check if the file is ready in backing. We basically take the same logic as later.
|
||||
// FIXME: we should cleanup and factorize this unholy mess
|
||||
// Check if the file is ready in backing.
|
||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||
if (strnotempty(headers->r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
headers->r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
ext_chg = resolve_extension(opt, headers->r.cdispo,
|
||||
headers->r.contenttype,
|
||||
headers->url_fil, ext, sizeof(ext));
|
||||
}
|
||||
else if (mime_type != NULL) {
|
||||
ext[0] = '\0';
|
||||
@@ -500,13 +496,6 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (!may_unknown2(opt, mime_type, fil)) {
|
||||
ext_chg = 1;
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtml(opt, fil) == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
ext_chg = 0;
|
||||
}
|
||||
@@ -696,30 +685,10 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// libérer emplacement backing
|
||||
}
|
||||
|
||||
{ // pas d'erreur, changer type?
|
||||
char s[16];
|
||||
|
||||
s[0] = '\0';
|
||||
if (strnotempty(back[b].r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, s);
|
||||
}
|
||||
}
|
||||
#ifdef DEFAULT_BIN_EXT
|
||||
// no extension and potentially bogus
|
||||
else if (ishtest == -2) {
|
||||
ext_chg = 1;
|
||||
strcpybuff(ext, DEFAULT_BIN_EXT + 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// no error: change the type?
|
||||
ext_chg = resolve_extension(
|
||||
opt, back[b].r.cdispo, back[b].r.contenttype,
|
||||
back[b].url_fil, ext, sizeof(ext));
|
||||
}
|
||||
// FIN Si non déplacé, forcer type?
|
||||
|
||||
|
||||
357
src/htsparse.c
357
src/htsparse.c
@@ -77,13 +77,14 @@ Please visit our Website: http://www.httrack.com
|
||||
/** Append to the output buffer the string 'A'. **/
|
||||
#define HT_ADD(A) TypedArrayAppend(output_buffer, A, strlen(A))
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION) do { \
|
||||
/* clang-format off: an edit realigns all backslashes, churning the macro. */
|
||||
/* clang-format off */
|
||||
/** Append 'A' to the output buffer, html-escaped; FACTOR = max byte expansion. **/
|
||||
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION, FACTOR) do { \
|
||||
if ((opt->getmode & 1) != 0 && ptr>0) { \
|
||||
const char *const str_ = (A); \
|
||||
size_t size_; \
|
||||
/* & is the maximum expansion */ \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * 5 + 1024); \
|
||||
TypedArrayEnsureRoom(output_buffer, strlen(str_) * (FACTOR) + 1024); \
|
||||
size_ = FUNCTION(str_, &TypedArrayTail(output_buffer), \
|
||||
TypedArrayRoom(output_buffer)); \
|
||||
TypedArraySize(output_buffer) += size_; \
|
||||
@@ -91,188 +92,113 @@ Please visit our Website: http://www.httrack.com
|
||||
} while(0)
|
||||
|
||||
/** Append to the output buffer the string 'A', html-escaped for &. **/
|
||||
#define HT_ADD_HTMLESCAPED(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print)
|
||||
#define HT_ADD_HTMLESCAPED(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print, HTS_HTMLESCAPE_MAXEXP)
|
||||
|
||||
/**
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* Append to the output buffer the string 'A', html-escaped for & and
|
||||
* high chars.
|
||||
**/
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full)
|
||||
#define HT_ADD_HTMLESCAPED_FULL(A) \
|
||||
HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full, HTS_HTMLESCAPE_FULL_MAXEXP)
|
||||
/* clang-format on */
|
||||
|
||||
// does nothing
|
||||
#define XH_uninit do {} while(0)
|
||||
|
||||
#define HT_ADD_END { \
|
||||
int ok=0;\
|
||||
if (TypedArraySize(output_buffer) != 0) { \
|
||||
const size_t ht_len = TypedArraySize(output_buffer); \
|
||||
const char *const ht_buff = TypedArrayElts(output_buffer); \
|
||||
char digest[32+2];\
|
||||
off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),savename()));\
|
||||
digest[0] = '\0';\
|
||||
domd5mem(TypedArrayElts(output_buffer), ht_len, digest, 1);\
|
||||
if (fsize_old == (off_t) ht_len) { \
|
||||
int mlen = 0;\
|
||||
char* mbuff;\
|
||||
cache_readdata(cache,"//[HTML-MD5]//",savename(),&mbuff,&mlen);\
|
||||
if (mlen) \
|
||||
mbuff[mlen]='\0';\
|
||||
if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
|
||||
ok=1;\
|
||||
hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s",savename());\
|
||||
} else {\
|
||||
ok=0;\
|
||||
} \
|
||||
}\
|
||||
if (!ok) { \
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 1, 1, r->notmodified); \
|
||||
fp=filecreate(&opt->state.strc, savename()); \
|
||||
if (fp) { \
|
||||
if (ht_len>0) {\
|
||||
if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
if (opt->log) { \
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to write HTML file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
}\
|
||||
fclose(fp); fp=NULL; \
|
||||
if (strnotempty(r->lastmodified)) \
|
||||
set_filetime_rfc822(savename(),r->lastmodified); \
|
||||
} else {\
|
||||
int fcheck;\
|
||||
if ((fcheck=check_fatal_io_errno())) {\
|
||||
hts_log_print(opt, LOG_ERROR, "Mirror aborted: disk full or filesystem problems"); \
|
||||
opt->state.exit_xh=-1;\
|
||||
}\
|
||||
hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", savename());\
|
||||
if (fcheck) {\
|
||||
hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
|
||||
}\
|
||||
}\
|
||||
} else {\
|
||||
file_notify(opt,urladr(), urlfil(), savename(), 0, 0, r->notmodified); \
|
||||
filenote(&opt->state.strc, savename(),NULL); \
|
||||
}\
|
||||
if (cache->ndx)\
|
||||
cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename(),digest,(int)strlen(digest));\
|
||||
} \
|
||||
TypedArrayFree(output_buffer); \
|
||||
}
|
||||
#define HT_ADD_FOP
|
||||
|
||||
// COPY IN HTSCORE.C
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
/* Mutable extended-context fields: one source of truth so the DEFINE/SET/SAVE
|
||||
load and store lists can't drift apart. */
|
||||
/* clang-format off */
|
||||
#define ENGINE_MUTABLE_FIELDS(X) \
|
||||
X(int, error, stre->error_) \
|
||||
X(int, store_errpage, stre->store_errpage_) \
|
||||
X(int, makeindex_done, stre->makeindex_done_) \
|
||||
X(FILE *, makeindex_fp, stre->makeindex_fp_) \
|
||||
X(int, makeindex_links, stre->makeindex_links_) \
|
||||
X(LLint, stat_fragment, stre->stat_fragment_)
|
||||
|
||||
#define ENGINE_FIELD_DECLARE(type, name, src) type name = *(src);
|
||||
#define ENGINE_FIELD_LOAD(type, name, src) name = *(src);
|
||||
#define ENGINE_FIELD_STORE(type, name, src) *(src) = name;
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
htsblk* const r HTS_UNUSED = stre->r_; \
|
||||
hash_struct* const hash HTS_UNUSED = stre->hash_; \
|
||||
char* const codebase HTS_UNUSED = stre->codebase; \
|
||||
char* const base HTS_UNUSED = stre->base; \
|
||||
/* */ \
|
||||
const char * const template_header HTS_UNUSED = stre->template_header_; \
|
||||
const char * const template_body HTS_UNUSED = stre->template_body_; \
|
||||
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
|
||||
/* */ \
|
||||
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
|
||||
/* */ \
|
||||
/* */ \
|
||||
int error = * stre->error_; \
|
||||
int store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
int makeindex_done = *stre->makeindex_done_; \
|
||||
FILE* makeindex_fp = *stre->makeindex_fp_; \
|
||||
int makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
LLint stat_fragment = *stre->stat_fragment_; \
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_DECLARE) \
|
||||
/* load-once (kept out of SET/SAVE): re-reading would reset the throttle */ \
|
||||
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
|
||||
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
|
||||
|
||||
#define ENGINE_SET_CONTEXT() \
|
||||
ENGINE_SET_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
error = * stre->error_; \
|
||||
store_errpage = * stre->store_errpage_; \
|
||||
/* */ \
|
||||
makeindex_done = *stre->makeindex_done_; \
|
||||
makeindex_fp = *stre->makeindex_fp_; \
|
||||
makeindex_links = *stre->makeindex_links_; \
|
||||
/* */ \
|
||||
stat_fragment = *stre->stat_fragment_; \
|
||||
makestat_time = stre->makestat_time; \
|
||||
makestat_fp = stre->makestat_fp
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_LOAD)
|
||||
|
||||
#define ENGINE_LOAD_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT()
|
||||
|
||||
#define ENGINE_SAVE_CONTEXT() \
|
||||
ENGINE_SAVE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
* stre->error_ = error; \
|
||||
* stre->store_errpage_ = store_errpage; \
|
||||
/* */ \
|
||||
*stre->makeindex_done_ = makeindex_done; \
|
||||
*stre->makeindex_fp_ = makeindex_fp; \
|
||||
*stre->makeindex_links_ = makeindex_links; \
|
||||
/* */ \
|
||||
*stre->stat_fragment_ = stat_fragment
|
||||
ENGINE_MUTABLE_FIELDS(ENGINE_FIELD_STORE)
|
||||
/* clang-format on */
|
||||
|
||||
#define _FILTERS (*opt->filters.filters)
|
||||
#define _FILTERS_PTR (opt->filters.filptr)
|
||||
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
|
||||
|
||||
/* Apply current *adr character for the script automate */
|
||||
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
|
||||
if (inscript) { \
|
||||
int new_state_pos; \
|
||||
new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*html]; \
|
||||
if (new_state_pos < 0) { \
|
||||
new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
|
||||
} \
|
||||
assertf(new_state_pos >= 0); \
|
||||
assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
|
||||
inscript_state_pos=new_state_pos; \
|
||||
} \
|
||||
} while(0)
|
||||
/* JS-detection automaton states; INSCRIPT_DEFAULT is the synthetic "any other
|
||||
char" column of the transition table. */
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
|
||||
/* Increment current pointer to 'steps' characters, modifying automate if necessary */
|
||||
#define INCREMENT_CURRENT_ADR(steps) do { \
|
||||
int steps__ = (int) ( steps ); \
|
||||
while(steps__ > 0) { \
|
||||
html++; \
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR(); \
|
||||
steps__ --; \
|
||||
} \
|
||||
} while(0)
|
||||
#define INSCRIPT_NSTATES 10 /* rows in the transition table */
|
||||
|
||||
/* Live view of the parser's automaton locals, set up once so the helpers below
|
||||
can drive it without capturing them by lexical scope. */
|
||||
typedef struct {
|
||||
const int *inscript; /* nonzero while inside a script body */
|
||||
const signed char (*table)[257]; /* [INSCRIPT_NSTATES][257] transitions */
|
||||
INSCRIPT *pos; /* current state */
|
||||
const char **html; /* parse cursor */
|
||||
} script_automate;
|
||||
|
||||
/* Feed the current *html byte to the automaton. No-op outside a script body. */
|
||||
static void hts_automate_lookup(const script_automate *aut) {
|
||||
if (*aut->inscript) {
|
||||
int next = aut->table[*aut->pos][(unsigned char) **aut->html];
|
||||
if (next < 0) {
|
||||
next = aut->table[*aut->pos][INSCRIPT_DEFAULT];
|
||||
}
|
||||
assertf(next >= 0 && next < INSCRIPT_NSTATES);
|
||||
*aut->pos = (INSCRIPT) next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Advance the cursor by 'steps' bytes, feeding each to the automaton. */
|
||||
static void hts_automate_increment(const script_automate *aut, int steps) {
|
||||
while (steps > 0) {
|
||||
(*aut->html)++;
|
||||
hts_automate_lookup(aut);
|
||||
steps--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Percent-encode the angle brackets of a string so it is safe to embed inside
|
||||
an HTML comment (the default footer) or any other HTML context. A URL holding
|
||||
@@ -417,20 +343,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int incomment = 0; // dans un <!--
|
||||
int inscript = 0; // dans un scipt pour applets javascript)
|
||||
int inscript_locked = 0; // in locked script (ie. js file)
|
||||
signed char inscript_state[10][257];
|
||||
typedef enum {
|
||||
INSCRIPT_START = 0,
|
||||
INSCRIPT_ANTISLASH,
|
||||
INSCRIPT_INQUOTE,
|
||||
INSCRIPT_INQUOTE2,
|
||||
INSCRIPT_SLASH,
|
||||
INSCRIPT_SLASHSLASH,
|
||||
INSCRIPT_COMMENT,
|
||||
INSCRIPT_COMMENT2,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE,
|
||||
INSCRIPT_ANTISLASH_IN_QUOTE2,
|
||||
INSCRIPT_DEFAULT = 256
|
||||
} INSCRIPT;
|
||||
signed char inscript_state[INSCRIPT_NSTATES][257];
|
||||
INSCRIPT inscript_state_pos = INSCRIPT_START;
|
||||
const char *inscript_name = NULL; // script tag name
|
||||
int inscript_tag = 0; // on est dans un <body onLoad="... terminé par >
|
||||
@@ -491,6 +404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
inscript_state[INSCRIPT_COMMENT2]['*'] = INSCRIPT_COMMENT2;
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE; /* #8: escape in '' */
|
||||
inscript_state[INSCRIPT_ANTISLASH_IN_QUOTE2][INSCRIPT_DEFAULT] = INSCRIPT_INQUOTE2; /* #9: escape in "" */
|
||||
const script_automate saut = {&inscript, inscript_state,
|
||||
&inscript_state_pos, &html};
|
||||
|
||||
/* Primary list or URLs */
|
||||
if (ptr == 0) {
|
||||
@@ -689,13 +604,14 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// Decode title with encoding
|
||||
if (str->page_charset_ != NULL
|
||||
&& *str->page_charset_ != '\0') {
|
||||
char *const sUtf =
|
||||
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
|
||||
if (str->page_charset_ != NULL &&
|
||||
*str->page_charset_ != '\0') {
|
||||
char *sUtf = hts_convertStringToUTF8(
|
||||
s, strlen(s), str->page_charset_);
|
||||
if (sUtf != NULL) {
|
||||
strcpy(s, sUtf);
|
||||
free(sUtf);
|
||||
/* UTF-8 can expand past s[]; truncate to fit */
|
||||
snprintf(s, sizeof(s), "%s", sUtf);
|
||||
freet(sUtf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -709,7 +625,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp,
|
||||
makeindex_links, makeindex_firstlink,
|
||||
template_footer, "primary", "primary");
|
||||
}
|
||||
} // if (opt->makeindex)
|
||||
}
|
||||
@@ -927,7 +845,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* automate */
|
||||
AUTOMATE_LOOKUP_CURRENT_ADR();
|
||||
hts_automate_lookup(&saut);
|
||||
|
||||
// Note:
|
||||
// Certaines pages ne respectent pas le html
|
||||
@@ -1843,7 +1761,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// sauter espaces
|
||||
// adr+=p;
|
||||
INCREMENT_CURRENT_ADR(p);
|
||||
hts_automate_increment(&saut, p);
|
||||
while((is_space(*html)
|
||||
|| (inscriptgen && html[0] == '\\' && is_space(html[1])
|
||||
)
|
||||
@@ -1858,7 +1776,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// puis quitter
|
||||
// html++; // sauter les espaces, "" et cie
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
|
||||
/* Stop at \n (LF) if primary links or link lists */
|
||||
@@ -1873,7 +1791,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html == '\\') {
|
||||
if ((*(html + 1) == '\'') || (*(html + 1) == '"')) { // \" ou \'
|
||||
// html+=2; // sauter
|
||||
INCREMENT_CURRENT_ADR(2);
|
||||
hts_automate_increment(&saut, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1921,7 +1839,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (srcset_p) {
|
||||
while(html < r->adr + r->size
|
||||
&& (is_realspace(*html) || *html == ','))
|
||||
INCREMENT_CURRENT_ADR(1);
|
||||
hts_automate_increment(&saut, 1);
|
||||
}
|
||||
eadr = html;
|
||||
|
||||
@@ -3381,7 +3299,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
assertf(eadr - html >= 0); // Should not go back
|
||||
if (eadr > html) {
|
||||
INCREMENT_CURRENT_ADR(eadr - 1 - html);
|
||||
hts_automate_increment(&saut, (int) (eadr - 1 - html));
|
||||
}
|
||||
// adr=eadr-1; // ** sauter
|
||||
|
||||
@@ -3400,7 +3318,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
q++; // skip whitespace and empty candidates
|
||||
if (q < endp && *q != '\0' && *q != ',' && *q != quote
|
||||
&& *q != '<' && *q != '>' && (unsigned char) *q >= 32) {
|
||||
INCREMENT_CURRENT_ADR(q - html); // keep the automate in sync
|
||||
hts_automate_increment(
|
||||
&saut, (int) (q - html)); // keep the automate in sync
|
||||
ok = 1;
|
||||
goto srcset_next;
|
||||
}
|
||||
@@ -3540,7 +3459,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
/* Flush and save to disk */
|
||||
HT_ADD_END; // achever
|
||||
if (TypedArraySize(output_buffer) != 0) {
|
||||
hts_finish_html_file(
|
||||
opt, cache, r, &fp, TypedArrayElts(output_buffer),
|
||||
TypedArraySize(output_buffer), urladr(), urlfil(), savename());
|
||||
}
|
||||
TypedArrayFree(output_buffer);
|
||||
}
|
||||
//
|
||||
//
|
||||
@@ -3565,6 +3489,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mirror the savename to tell whether a redirect saves to the same file (#159);
|
||||
* contract in htsparse.h. */
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil) {
|
||||
const int norm_slash = opt->urlhack && !opt->no_slash_dedup;
|
||||
const int norm_query = opt->urlhack && !opt->no_query_dedup;
|
||||
char BIGSTK n_fil[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
if (strcasecmp(jump_identification_const(moved_adr),
|
||||
jump_identification_const(cur_adr)) != 0)
|
||||
return HTS_FALSE;
|
||||
fil_normalized_filtered_ex(moved_fil, n_fil, NULL, norm_slash, norm_query);
|
||||
fil_normalized_filtered_ex(cur_fil, pn_fil, NULL, norm_slash, norm_query);
|
||||
return strcasecmp(n_fil, pn_fil) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Check 301, 302, .. statuscodes (moved)
|
||||
*/
|
||||
@@ -3610,36 +3552,9 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
if ((reponse =
|
||||
ident_url_relatif(mov_url, urladr(), urlfil(), moved)) >= 0) {
|
||||
int set_prio_to = 0; // pas de priotité fixéd par wizard
|
||||
|
||||
// check whether URLHack is harmless or not (per the effective
|
||||
// sub-flags)
|
||||
if (opt->urlhack && (!opt->no_www_dedup || !opt->no_slash_dedup ||
|
||||
!opt->no_query_dedup)) {
|
||||
const int norm_host = !opt->no_www_dedup;
|
||||
const int norm_slash = !opt->no_slash_dedup;
|
||||
const int norm_query = !opt->no_query_dedup;
|
||||
char BIGSTK n_adr[HTS_URLMAXSIZE * 2], n_fil[HTS_URLMAXSIZE * 2];
|
||||
char BIGSTK pn_adr[HTS_URLMAXSIZE * 2], pn_fil[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strlcpybuff(n_adr,
|
||||
norm_host ? jump_normalized_const(moved->adr)
|
||||
: jump_identification_const(moved->adr),
|
||||
sizeof(n_adr));
|
||||
strlcpybuff(pn_adr,
|
||||
norm_host ? jump_normalized_const(urladr())
|
||||
: jump_identification_const(urladr()),
|
||||
sizeof(pn_adr));
|
||||
fil_normalized_filtered_ex(moved->fil, n_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
fil_normalized_filtered_ex(urlfil(), pn_fil, NULL, norm_slash,
|
||||
norm_query);
|
||||
if (strcasecmp(n_adr, pn_adr) == 0
|
||||
&& strcasecmp(n_fil, pn_fil) == 0) {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirected link is identical because of 'URL Hack' option: %s%s and %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
}
|
||||
// A same-file alias redirect must be followed, not stubbed (#159).
|
||||
const hts_boolean same_savefile = hts_redirect_same_savefile(
|
||||
opt, urladr(), urlfil(), moved->adr, moved->fil);
|
||||
//if (ident_url_absolute(mov_url,moved->adr,moved->fil)!=-1) { // ok URL reconnue
|
||||
// c'est (en gros) la même URL..
|
||||
// si c'est un problème de casse dans le host c'est que le serveur est buggé
|
||||
@@ -3667,7 +3582,17 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
hts_log_print(opt, LOG_DEBUG, "moved link accepted: %s%s",
|
||||
moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
} else if (same_savefile) {
|
||||
// A stub would point at itself; follow the redirect instead.
|
||||
if (hts_acceptlink(opt, ptr, moved->adr, moved->fil, NULL, NULL,
|
||||
&set_prio_to, NULL) != 1) {
|
||||
get_it = 1;
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
"Redirect to a same-file alias, fetching real "
|
||||
"content: %s%s -> %s%s",
|
||||
urladr(), urlfil(), moved->adr, moved->fil);
|
||||
}
|
||||
} /* sinon traité normalement */
|
||||
}
|
||||
|
||||
//if ((strfield2(moved->adr,urladr())!=0) && (strfield2(moved->fil,urlfil())!=0)) { // identique à casse près
|
||||
@@ -3690,7 +3615,11 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
heap(heap(ptr)->precedent)->adr,
|
||||
heap(heap(ptr)->precedent)->fil, opt,
|
||||
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
|
||||
if (hash_read(hash, savedmoved.save, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// Same-file alias: the reserved name is the invalidated source,
|
||||
// so record anyway.
|
||||
if (same_savefile ||
|
||||
hash_read(hash, savedmoved.save, NULL,
|
||||
HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
|
||||
// enregistrer lien avec SAV IDENTIQUE
|
||||
if (hts_record_link(opt, moved->adr, moved->fil, heap(ptr)->sav, "", "", NULL)) {
|
||||
// mode test?
|
||||
@@ -3714,7 +3643,6 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
"moving %s to an existing file %s",
|
||||
heap(ptr)->fil, urlfil());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4917,6 +4845,9 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
|
||||
/* Still have a back reference */
|
||||
if (b >= 0) {
|
||||
/* Patch destination filename for direct-to-disk mode, BEFORE any
|
||||
finalize: it records and caches the entry under url_sav */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
/* Finalize now as we have the type */
|
||||
if (back[b].status == STATUS_READY) {
|
||||
if (!back[b].finalized) {
|
||||
@@ -4924,8 +4855,6 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
back_finalize(opt, cache, sback, b);
|
||||
}
|
||||
}
|
||||
/* Patch destination filename for direct-to-disk mode */
|
||||
strcpybuff(back[b].url_sav, afs->save);
|
||||
}
|
||||
|
||||
} // b >= 0
|
||||
|
||||
@@ -116,6 +116,19 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre);
|
||||
int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
htsmoduleStructExtended * stre);
|
||||
|
||||
/*
|
||||
Non-zero if a redirect (cur_adr,cur_fil)->(moved_adr,moved_fil) saves to the
|
||||
same local file, so it must be followed rather than turned into a
|
||||
self-pointing "moved" stub (#159). Mirrors the savename: scheme+userinfo
|
||||
stripped, www kept (www dedup is the crawl layer's job), path
|
||||
slash/query-normalized per the URL-hack flags. Not hash_url_equals: that keys
|
||||
on the dedup hash, which folds www and never collapses http<->https.
|
||||
*/
|
||||
hts_boolean hts_redirect_same_savefile(httrackp *opt, const char *cur_adr,
|
||||
const char *cur_fil,
|
||||
const char *moved_adr,
|
||||
const char *moved_fil);
|
||||
|
||||
/*
|
||||
Process user interactions: pause, add link, delete link..
|
||||
*/
|
||||
|
||||
@@ -456,6 +456,13 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
|
||||
return b->buf;
|
||||
}
|
||||
|
||||
/** True if 'count' records of >= 1 byte each fit in 'available' bytes; guards
|
||||
an attacker-controlled count driving a large allocation. */
|
||||
static HTS_INLINE HTS_UNUSED hts_boolean hts_count_fits(size_t count,
|
||||
LLint available) {
|
||||
return (available >= 0 && (LLint) count <= available) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
|
||||
added bounds checking. freet() also NULLs the freed pointer and tolerates
|
||||
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the
|
||||
|
||||
@@ -45,10 +45,12 @@ Please visit our Website: http://www.httrack.com
|
||||
#include "htscore.h"
|
||||
#include "htsdefines.h"
|
||||
#include "htslib.h"
|
||||
#include "htsparse.h"
|
||||
#include "htscache_selftest.h"
|
||||
#include "htsdns_selftest.h"
|
||||
#include "htscharset.h"
|
||||
#include "htsencoding.h"
|
||||
#include "htsftp.h"
|
||||
#include "htsmd5.h"
|
||||
#if HTS_USEZLIB
|
||||
#include "htszlib.h"
|
||||
@@ -60,6 +62,10 @@ Please visit our Website: http://www.httrack.com
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#ifndef _WIN32
|
||||
#include <sys/socket.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
/* very minimalistic internal tests */
|
||||
static void basic_selftests(void) {
|
||||
@@ -524,6 +530,41 @@ static int string_safety_selftests(void) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* StringCatN/StringSetLength must eval SIZE once: (n_eval++, V) leaves
|
||||
n_eval == 2 on a double-eval macro. */
|
||||
{
|
||||
String s = STRING_EMPTY;
|
||||
int n_eval = 0;
|
||||
|
||||
StringCat(s, "hello");
|
||||
StringCatN(s, "world", (n_eval++, 3)); /* strlen>SIZE so the clamp runs */
|
||||
if (n_eval != 1 || strcmp(StringBuff(s), "hellowor") != 0) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_eval = 0;
|
||||
StringSetLength(s, (n_eval++, 5));
|
||||
if (n_eval != 1 || StringLength(s) != 5) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
StringFree(s);
|
||||
}
|
||||
|
||||
/* StringSubRW still reads/writes after dropping its duplicate definition. */
|
||||
{
|
||||
String s = STRING_EMPTY;
|
||||
|
||||
StringCat(s, "abc");
|
||||
StringSubRW(s, 1) = 'X';
|
||||
if (StringSub(s, 1) != 'X' || strcmp(StringBuff(s), "aXc") != 0) {
|
||||
StringFree(s);
|
||||
return 1;
|
||||
}
|
||||
StringFree(s);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -672,7 +713,8 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
}
|
||||
s = strdupt(argv[0]);
|
||||
enc = argc >= 2 ? argv[1] : "UTF-8";
|
||||
if (s != NULL && hts_unescapeEntitiesWithCharset(s, s, strlen(s), enc) == 0) {
|
||||
if (s != NULL &&
|
||||
hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1, enc) == 0) {
|
||||
printf("%s\n", s);
|
||||
freet(s);
|
||||
} else {
|
||||
@@ -681,6 +723,34 @@ static int st_entities(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* The unescapers must reserve one byte for the trailing NUL: a 'max'-byte
|
||||
dest holding 'max' output chars pre-fix wrote dest[max] (1-byte OOB, caught
|
||||
by ASan). Both unescapeEntities and unescapeUrl share the guard. */
|
||||
static int st_unescape_bounds(httrackp *opt, int argc, char **argv) {
|
||||
char dest[4];
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(hts_unescapeEntities("abcd", dest, sizeof(dest)) == -1);
|
||||
assertf(hts_unescapeUrl("abcd", dest, sizeof(dest)) == -1);
|
||||
assertf(hts_unescapeEntities("abc", dest, sizeof(dest)) == 0);
|
||||
assertf(strcmp(dest, "abc") == 0);
|
||||
/* raw multi-byte UTF-8 flush path (bypasses the per-byte guard) */
|
||||
assertf(hts_unescapeUrl("ab\xC3\xA9", dest, sizeof(dest)) == -1);
|
||||
assertf(hts_unescapeUrl("a\xC3\xA9", dest, sizeof(dest)) == 0);
|
||||
assertf(strcmp(dest, "a\xC3\xA9") == 0);
|
||||
{
|
||||
/* %xx-encoded flush path (utfBufferJ = lastJ rollback) */
|
||||
char wide[8];
|
||||
|
||||
assertf(hts_unescapeUrl("%C3%A9", wide, sizeof(wide)) == 0);
|
||||
assertf(strcmp(wide, "\xC3\xA9") == 0);
|
||||
}
|
||||
printf("unescape-bounds self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int st_hashtable(httrackp *opt, int argc, char **argv) {
|
||||
char *snum;
|
||||
unsigned long count = 0;
|
||||
@@ -1023,35 +1093,202 @@ static int st_resolve(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Extra args are key=value: adr= cdispo= statuscode= status= strip= urlhack=
|
||||
no-www= no-slash= no-query= n83= type=, plus repeatable prior=adr|fil|sav
|
||||
registering an already-crawled link (dedup/collision paths). */
|
||||
/* Parse raw response-header lines and print the naming-relevant fields. */
|
||||
static int st_header(httrackp *opt, int argc, char **argv) {
|
||||
htsblk r;
|
||||
int i;
|
||||
|
||||
(void) opt;
|
||||
if (argc < 1) {
|
||||
fprintf(stderr, "header: needs at least one raw header line\n");
|
||||
return 1;
|
||||
}
|
||||
memset(&r, 0, sizeof(r));
|
||||
for (i = 0; i < argc; i++) {
|
||||
char BIGSTK line[HTS_URLMAXSIZE * 2];
|
||||
|
||||
strcpybuff(line, argv[i]);
|
||||
treathead(NULL, "www.example.com", "/", &r, line);
|
||||
}
|
||||
printf("contenttype=%s cdispo=%s\n", r.contenttype, r.cdispo);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Value of one hexadecimal digit, or -1 if 'c' is not in [0-9A-Fa-f]. */
static int st_hex_nibble(char c) {
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

/* Decode a body argument ("hex:FFD8.." or literal text) into buf.
   At most size - 1 bytes are stored and buf is always NUL-terminated; the
   number of stored bytes (terminator excluded) is returned. With size == 0
   nothing is written and 0 is returned.
   Hex input is consumed strictly two digits at a time: decoding stops at the
   first pair that is not two hex digits (including a dangling single digit),
   so signs, blanks or a "0x" prefix are never silently accepted. */
static size_t st_decode_body(const char *arg, char *buf, size_t size) {
  size_t n = 0;

  if (size == 0) {
    return 0; /* no room even for the terminator */
  }
  if (strncmp(arg, "hex:", 4) == 0) {
    const char *s = arg + 4;

    for (; n + 1 < size; s += 2) {
      const int hi = st_hex_nibble(s[0]);
      /* only look at s[1] once s[0] is known not to be the terminator */
      const int lo = hi >= 0 ? st_hex_nibble(s[1]) : -1;

      if (hi < 0 || lo < 0)
        break;
      buf[n++] = (char) ((hi << 4) | lo);
    }
  } else {
    n = strlen(arg);
    if (n >= size)
      n = size - 1; /* truncate, keeping one byte for the NUL */
    memcpy(buf, arg, n);
  }
  buf[n] = '\0';
  return n;
}
|
||||
|
||||
static int st_savename(httrackp *opt, int argc, char **argv) {
|
||||
lien_adrfilsave afs;
|
||||
cache_back cache;
|
||||
struct_back *sback;
|
||||
hash_struct hash;
|
||||
lien_back headers;
|
||||
const char *adr = "www.example.com";
|
||||
const char *cdispo = NULL;
|
||||
const char *body = NULL;
|
||||
const char *cached = NULL;
|
||||
const char *bodyfile = "st-savename-body.tmp";
|
||||
int statuscode = HTTP_OK, status = 0;
|
||||
int i;
|
||||
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "savename: needs a fil and a content-type\n");
|
||||
return 1;
|
||||
}
|
||||
/* knobs first: hash_init and the prior links depend on them */
|
||||
for (i = 2; i < argc; i++) {
|
||||
const char *const a = argv[i];
|
||||
|
||||
if (strncmp(a, "adr=", 4) == 0)
|
||||
adr = a + 4;
|
||||
else if (strncmp(a, "cdispo=", 7) == 0)
|
||||
cdispo = a + 7;
|
||||
else if (strncmp(a, "statuscode=", 11) == 0)
|
||||
statuscode = atoi(a + 11);
|
||||
else if (strncmp(a, "status=", 7) == 0)
|
||||
status = atoi(a + 7);
|
||||
else if (strncmp(a, "strip=", 6) == 0)
|
||||
StringCopy(opt->strip_query, a + 6);
|
||||
else if (strncmp(a, "urlhack=", 8) == 0)
|
||||
opt->urlhack = atoi(a + 8) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-www=", 7) == 0)
|
||||
opt->no_www_dedup = atoi(a + 7) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-slash=", 9) == 0)
|
||||
opt->no_slash_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "no-query=", 9) == 0)
|
||||
opt->no_query_dedup = atoi(a + 9) ? HTS_TRUE : HTS_FALSE;
|
||||
else if (strncmp(a, "n83=", 4) == 0)
|
||||
opt->savename_83 = atoi(a + 4);
|
||||
else if (strncmp(a, "type=", 5) == 0)
|
||||
opt->savename_type = atoi(a + 5);
|
||||
else if (strncmp(a, "body=", 5) == 0)
|
||||
body = a + 5;
|
||||
else if (strncmp(a, "cached=", 7) == 0)
|
||||
cached = a + 7;
|
||||
else if (strncmp(a, "prior=", 6) != 0) {
|
||||
fprintf(stderr, "savename: unknown arg '%s'\n", a);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
memset(&afs, 0, sizeof(afs));
|
||||
strcpybuff(afs.af.adr, "www.example.com");
|
||||
strcpybuff(afs.af.adr, adr);
|
||||
strcpybuff(afs.af.fil, argv[0]);
|
||||
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.hashtable = (void *) coucal_new(0);
|
||||
if (cached != NULL) { /* cached=<content-type>|<save name> */
|
||||
char *dup = strdupt(cached);
|
||||
char *const sep = strchr(dup, '|');
|
||||
char locbuf[64] = "";
|
||||
htsblk cr;
|
||||
|
||||
if (sep == NULL) {
|
||||
fprintf(stderr, "savename: cached needs ctype|save\n");
|
||||
return 1;
|
||||
}
|
||||
*sep = '\0';
|
||||
/* one-entry cache in cwd, reopened read-only; body is PNG magic on
|
||||
purpose: naming must not depend on stored content */
|
||||
StringCopy(opt->path_log, "");
|
||||
cache.type = 1;
|
||||
cache.log = cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache_init(&cache, opt);
|
||||
hts_init_htsblk(&cr);
|
||||
cr.statuscode = HTTP_OK;
|
||||
strcpybuff(cr.msg, "OK");
|
||||
strcpybuff(cr.contenttype, dup);
|
||||
cr.location = locbuf;
|
||||
cr.adr = strdupt("\x89PNG\r\n\x1a\n");
|
||||
cr.size = 8;
|
||||
cache_add(opt, &cache, &cr, adr, argv[0], sep + 1, 1, NULL);
|
||||
freet(cr.adr);
|
||||
if (cache.zipOutput != NULL) {
|
||||
zipClose(cache.zipOutput, NULL);
|
||||
cache.zipOutput = NULL;
|
||||
}
|
||||
memset(&cache, 0, sizeof(cache));
|
||||
cache.type = 1;
|
||||
cache.log = cache.errlog = stderr;
|
||||
cache.hashtable = coucal_new(0);
|
||||
cache.ro = 1;
|
||||
cache_init(&cache, opt);
|
||||
freet(dup);
|
||||
} else {
|
||||
cache.hashtable = (void *) coucal_new(0);
|
||||
}
|
||||
|
||||
sback = back_new(opt, opt->maxsoc * 32 + 1024);
|
||||
/* same wiring as hts_mirror (htscore.c) */
|
||||
hash_init(opt, &hash, opt->urlhack);
|
||||
hash.liens = (const lien_url *const *const *) &opt->liens;
|
||||
opt->hash = &hash;
|
||||
hts_record_init(opt);
|
||||
|
||||
for (i = 2; i < argc; i++) {
|
||||
if (strncmp(argv[i], "prior=", 6) == 0) {
|
||||
char *dup = strdupt(argv[i] + 6);
|
||||
char *const p1 = strchr(dup, '|');
|
||||
char *const p2 = p1 != NULL ? strchr(p1 + 1, '|') : NULL;
|
||||
|
||||
if (p2 == NULL) {
|
||||
fprintf(stderr, "savename: prior needs adr|fil|sav\n");
|
||||
return 1;
|
||||
}
|
||||
*p1 = *p2 = '\0';
|
||||
if (!hts_record_link(opt, dup, p1 + 1, p2 + 1, "", "", NULL))
|
||||
return 1;
|
||||
freet(dup);
|
||||
}
|
||||
}
|
||||
|
||||
memset(&headers, 0, sizeof(headers));
|
||||
headers.status = 0;
|
||||
headers.r.statuscode = HTTP_OK;
|
||||
headers.status = status;
|
||||
headers.r.statuscode = statuscode;
|
||||
strcpybuff(headers.r.contenttype, argv[1]);
|
||||
if (cdispo != NULL)
|
||||
strcpybuff(headers.r.cdispo, cdispo);
|
||||
strcpybuff(headers.url_fil, argv[0]);
|
||||
if (body != NULL) { /* leading body bytes, exposed via url_sav */
|
||||
char BIGSTK data[1024];
|
||||
const size_t n = st_decode_body(body, data, sizeof(data));
|
||||
FILE *const fp = fopen(bodyfile, "wb");
|
||||
|
||||
if (fp == NULL || fwrite(data, 1, n, fp) != n) {
|
||||
fprintf(stderr, "savename: can not write %s\n", bodyfile);
|
||||
return 1;
|
||||
}
|
||||
fclose(fp);
|
||||
strcpybuff(headers.url_sav, bodyfile);
|
||||
}
|
||||
|
||||
url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache, &hash, 0, 0,
|
||||
&headers);
|
||||
if (body != NULL)
|
||||
(void) UNLINK(bodyfile);
|
||||
printf("savename: %s\n", afs.save);
|
||||
return 0;
|
||||
}
|
||||
@@ -1305,6 +1542,165 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* #159: hts_redirect_same_savefile decides whether a redirect is a same-file
|
||||
* alias. */
|
||||
static int st_redirect_samefile(httrackp *opt, int argc, char **argv) {
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
#define SAME(aa, fa, ab, fb) hts_redirect_same_savefile(opt, aa, fa, ab, fb)
|
||||
/* scheme and userinfo collapse (the #159 case); a different path does not */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
assertf(SAME("http://user@foo.com", "/a", "http://foo.com", "/a"));
|
||||
assertf(!SAME("http://foo.com", "/a", "http://foo.com", "/b"));
|
||||
/* www stays distinct here; the crawl's dedup layer folds www, not this helper
|
||||
*/
|
||||
opt->urlhack = HTS_TRUE;
|
||||
opt->no_www_dedup = opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
assertf(!SAME("http://www.foo.com", "/a", "http://foo.com", "/a"));
|
||||
/* slash/query fold only when the dedup flag is on */
|
||||
assertf(SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_TRUE;
|
||||
assertf(!SAME("https://foo.com", "/a//b", "http://foo.com", "/a/b"));
|
||||
assertf(
|
||||
!SAME("https://foo.com", "/p?b=2&a=1", "http://foo.com", "/p?a=1&b=2"));
|
||||
/* but a pure scheme alias still collapses regardless of dedup opt-outs */
|
||||
assertf(SAME("http://foo.com", "/a/b", "https://foo.com", "/a/b"));
|
||||
opt->no_slash_dedup = opt->no_query_dedup = HTS_FALSE;
|
||||
#undef SAME
|
||||
printf("redirect-samefile self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
char path[HTS_URLMAXSIZE];
|
||||
char buf[4096];
|
||||
FILE *fp;
|
||||
size_t n;
|
||||
int done;
|
||||
|
||||
assertf(argc >= 1);
|
||||
snprintf(path, sizeof(path), "%s/index.html", argv[0]);
|
||||
|
||||
/* single first link: footer + a refresh meta carrying the escaped URL */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 1, "http://example.com/a b", "%s%s", "",
|
||||
"");
|
||||
assertf(fp == NULL); /* the function closed and cleared it */
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") != NULL);
|
||||
assertf(strstr(buf, "example.com") != NULL);
|
||||
|
||||
/* no single link: footer only, no refresh meta */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 0, NULL, "%s%s", "", "");
|
||||
assertf(fp == NULL);
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") == NULL);
|
||||
|
||||
UNLINK(path);
|
||||
printf("makeindex self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Each inplace_escape_*() must equal escape_*() on a copy. */
|
||||
static int st_inplace_escape(httrackp *opt, int argc, char **argv) {
|
||||
/* >255 bytes forces the helper's malloct path, not the stack buffer */
|
||||
static char longstr[600];
|
||||
static const char *const samples[] = {
|
||||
"", "abc", "a b/c?d=e&f", "h\x8ello w\x94rld",
|
||||
"a%b\"c<d>", "/path to/file", longstr};
|
||||
static size_t (*const inplace[])(char *, size_t) = {
|
||||
inplace_escape_in_url, inplace_escape_spc_url, inplace_escape_uri_utf,
|
||||
inplace_escape_check_url, inplace_escape_uri};
|
||||
static size_t (*const plain[])(const char *, char *, size_t) = {
|
||||
escape_in_url, escape_spc_url, escape_uri_utf, escape_check_url,
|
||||
escape_uri};
|
||||
size_t i, f;
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
|
||||
memset(longstr, 'a', sizeof(longstr) - 1);
|
||||
for (f = 0; f < sizeof(inplace) / sizeof(inplace[0]); f++) {
|
||||
for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
|
||||
char ref[4096], work[4096];
|
||||
size_t rret, iret;
|
||||
rret = plain[f](samples[i], ref, sizeof(ref));
|
||||
strcpybuff(work, samples[i]);
|
||||
iret = inplace[f](work, sizeof(work));
|
||||
assertf(iret == rret);
|
||||
assertf(strcmp(work, ref) == 0);
|
||||
}
|
||||
}
|
||||
printf("inplace-escape self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Pin HTS_HTMLESCAPE*_MAXEXP to each escaper's true max byte expansion. */
|
||||
static int st_escape_room(httrackp *opt, int argc, char **argv) {
|
||||
/* N > 1023: where 6n outgrows the old 5n+1024 reservation */
|
||||
enum { N = 2000 };
|
||||
|
||||
char *src = malloct(N + 1);
|
||||
char *dst;
|
||||
size_t room, got;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
|
||||
/* _full worst case: a high byte expands to "&#xHH;" (6 bytes) */
|
||||
memset(src, 0xE9, N);
|
||||
src[N] = '\0';
|
||||
room = (size_t) N * HTS_HTMLESCAPE_FULL_MAXEXP + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print_full(src, dst, room);
|
||||
assertf(got == (size_t) N * HTS_HTMLESCAPE_FULL_MAXEXP);
|
||||
assertf(strlen(dst) == got);
|
||||
freet(dst);
|
||||
|
||||
/* one factor short overflows (returns size), truncating the page: the bug */
|
||||
room = (size_t) N * (HTS_HTMLESCAPE_FULL_MAXEXP - 1) + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print_full(src, dst, room);
|
||||
assertf(got == room);
|
||||
freet(dst);
|
||||
|
||||
/* plain escaper worst case: '&' -> "&" (5); high bytes stay verbatim */
|
||||
memset(src, '&', N);
|
||||
src[N] = '\0';
|
||||
room = (size_t) N * HTS_HTMLESCAPE_MAXEXP + 1024;
|
||||
dst = malloct(room);
|
||||
got = escape_for_html_print(src, dst, room);
|
||||
assertf(got == (size_t) N * HTS_HTMLESCAPE_MAXEXP);
|
||||
assertf(strlen(dst) == got);
|
||||
freet(dst);
|
||||
|
||||
freet(src);
|
||||
printf("escape-room self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
@@ -1574,6 +1970,86 @@ static int st_robots(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* get_ftp_line must bound a hostile, CRLF-less reply into its internal
|
||||
1024-byte buffer; ASan turns the pre-fix overflow into an abort here. */
|
||||
#ifndef _WIN32
|
||||
static int st_ftpline(httrackp *opt, int argc, char **argv) {
|
||||
int sv[2];
|
||||
char line[2048];
|
||||
char flood[4096];
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
memset(flood, 'x', sizeof(flood));
|
||||
assertf(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);
|
||||
assertf(write(sv[1], "220 ", 4) == 4); // valid 3-digit code
|
||||
assertf(write(sv[1], flood, sizeof(flood)) == (ssize_t) sizeof(flood));
|
||||
assertf(write(sv[1], "\r\n", 2) == 2); // end the line so we return
|
||||
close(sv[1]);
|
||||
line[0] = '\0';
|
||||
get_ftp_line(sv[0], line, sizeof(line), 5);
|
||||
close(sv[0]);
|
||||
printf("ftp-line self-test OK (bounded %d-byte reply)\n",
|
||||
(int) sizeof(flood));
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ftp_split_userpass: well-formed split, plus a hostile over-long userinfo
|
||||
that pre-fix overran user[256]/pass[256]. */
|
||||
static int st_ftpuser(httrackp *opt, int argc, char **argv) {
|
||||
char user[256], pass[256];
|
||||
char in[1200];
|
||||
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
{
|
||||
const char ok[] = "bob:secret@host/f"; // '@' at index 10
|
||||
|
||||
ftp_split_userpass(ok, ok + 11, user, sizeof(user), pass, sizeof(pass));
|
||||
assertf(strcmp(user, "bob") == 0);
|
||||
assertf(strcmp(pass, "secret") == 0);
|
||||
}
|
||||
memset(in, 'u', 400);
|
||||
in[400] = ':';
|
||||
memset(in + 401, 'p', 400);
|
||||
in[801] = '@';
|
||||
in[802] = '\0';
|
||||
ftp_split_userpass(in, in + 802, user, sizeof(user), pass, sizeof(pass));
|
||||
assertf(strlen(user) == sizeof(user) - 1);
|
||||
assertf(strlen(pass) == sizeof(pass) - 1);
|
||||
{
|
||||
/* tight sizes + guard byte catch an off-by-one the 256 case can't */
|
||||
char ubuf[16], pbuf[16];
|
||||
|
||||
memset(ubuf, 'Z', sizeof(ubuf));
|
||||
memset(pbuf, 'Z', sizeof(pbuf));
|
||||
ftp_split_userpass(in, in + 802, ubuf, 8, pbuf, 8);
|
||||
assertf(strcmp(ubuf, "uuuuuuu") == 0);
|
||||
assertf(strcmp(pbuf, "ppppppp") == 0);
|
||||
assertf(ubuf[8] == 'Z' && pbuf[8] == 'Z');
|
||||
}
|
||||
printf("ftp-userpass self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* hts_count_fits caps the .class constant-pool entry count to the file size,
|
||||
rejecting the ~68 MB-per-file calloc DoS. */
|
||||
static int st_java(httrackp *opt, int argc, char **argv) {
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(hts_count_fits(10, 1000) == HTS_TRUE);
|
||||
assertf(hts_count_fits(0, 10) == HTS_TRUE);
|
||||
assertf(hts_count_fits(65535, 10) == HTS_FALSE);
|
||||
assertf(hts_count_fits(1, 0) == HTS_FALSE);
|
||||
assertf(hts_count_fits(1, -1) == HTS_FALSE);
|
||||
printf("java constant-pool cap self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1594,6 +2070,8 @@ static const struct selftest_entry {
|
||||
st_stripquery},
|
||||
{"urlhack", "", "-%u url-hack sub-flag (www/slash/query) self-test",
|
||||
st_urlhack},
|
||||
{"redirect-samefile", "", "same-file redirect detection self-test (#159)",
|
||||
st_redirect_samefile},
|
||||
{"mime", "<filename>", "MIME type for a filename", st_mime},
|
||||
{"charset", "<charset> <string>",
|
||||
"convert a string to UTF-8 from a charset", st_charset},
|
||||
@@ -1602,6 +2080,8 @@ static const struct selftest_entry {
|
||||
{"idna-decode", "<host>", "decode an IDNA/punycode hostname",
|
||||
st_idna_decode},
|
||||
{"entities", "<string> [encoding]", "unescape HTML entities", st_entities},
|
||||
{"unescape-bounds", "", "unescapers reserve the NUL byte (no 1-byte OOB)",
|
||||
st_unescape_bounds},
|
||||
{"hashtable", "<count|file>", "coucal hashtable stress test", st_hashtable},
|
||||
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
|
||||
st_strsafe},
|
||||
@@ -1611,8 +2091,10 @@ static const struct selftest_entry {
|
||||
st_relative},
|
||||
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",
|
||||
st_resolve},
|
||||
{"savename", "<fil> <content-type>", "local save-name for a URL",
|
||||
st_savename},
|
||||
{"header", "<raw-header-line> ...", "response header-line parsing",
|
||||
st_header},
|
||||
{"savename", "<fil> <content-type> [key=value ...]",
|
||||
"local save-name for a URL", st_savename},
|
||||
{"cache", "<dir>", "cache read/write round-trip self-test", st_cache},
|
||||
{"cache-golden", "<dir> [regen]", "frozen cache-format read self-test",
|
||||
st_cache_golden},
|
||||
@@ -1621,11 +2103,23 @@ static const struct selftest_entry {
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"makeindex", "[dir]", "hts_finish_makeindex footer/refresh self-test",
|
||||
st_makeindex},
|
||||
{"inplace-escape", "", "inplace_escape_* vs escape_* equivalence self-test",
|
||||
st_inplace_escape},
|
||||
{"escape-room", "", "HT_ADD_HTMLESCAPED* reservation-factor self-test",
|
||||
st_escape_room},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
{"acceptencoding", "[dir]",
|
||||
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
|
||||
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
|
||||
st_robots},
|
||||
#ifndef _WIN32
|
||||
{"ftp-line", "", "get_ftp_line bounds a hostile FTP reply line",
|
||||
st_ftpline},
|
||||
#endif
|
||||
{"ftp-userpass", "", "ftp_split_userpass bounds URL userinfo", st_ftpuser},
|
||||
{"java", "", "java .class constant-pool count cap self-test", st_java},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
@@ -121,9 +121,6 @@ struct String {
|
||||
/** Byte at POS (read/write). No bounds check; POS must be < StringLength. **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Subcharacter (read/write) **/
|
||||
#define StringSubRW(BLK, POS) (StringBuffRW(BLK)[POS])
|
||||
|
||||
/** Byte POS positions from the end (read). POS==1 is the last byte. **/
|
||||
#define StringRight(BLK, POS) (StringBuff(BLK)[StringLength(BLK) - POS])
|
||||
|
||||
@@ -191,8 +188,9 @@ HTS_STATIC char *StringBuffN_(String *blk, int size) {
|
||||
asserts SIZE fits the existing content; does not (re)allocate. **/
|
||||
#define StringSetLength(BLK, SIZE) \
|
||||
do { \
|
||||
if (SIZE >= 0) { \
|
||||
(BLK).length_ = SIZE; \
|
||||
const int len__ = (SIZE); /* signed: negative means strlen(buffer_) */ \
|
||||
if (len__ >= 0) { \
|
||||
(BLK).length_ = len__; \
|
||||
} else { \
|
||||
(BLK).length_ = strlen((BLK).buffer_); \
|
||||
} \
|
||||
@@ -308,10 +306,11 @@ HTS_STATIC void StringAttach(String *blk, char **str) {
|
||||
#define StringCatN(BLK, STR, SIZE) \
|
||||
do { \
|
||||
const char *str__ = (STR); \
|
||||
const size_t usize__ = (SIZE); \
|
||||
if (str__ != NULL) { \
|
||||
size_t size__ = strlen(str__); \
|
||||
if (size__ > (SIZE)) { \
|
||||
size__ = (SIZE); \
|
||||
if (size__ > usize__) { \
|
||||
size__ = usize__; \
|
||||
} \
|
||||
StringMemcat(BLK, str__, size__); \
|
||||
} \
|
||||
|
||||
7
tests/01_engine-escape-room.test
Normal file
7
tests/01_engine-escape-room.test
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HT_ADD_HTMLESCAPED* must reserve the escaper's worst case (6 for _full).
|
||||
httrack -O /dev/null -#test=escape-room run | grep -q "escape-room self-test OK"
|
||||
7
tests/01_engine-ftp-line.test
Executable file
7
tests/01_engine-ftp-line.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# get_ftp_line bounds a hostile CRLF-less FTP reply into its 1024-byte buffer.
|
||||
httrack -O /dev/null -#test=ftp-line run | grep -q "ftp-line self-test OK"
|
||||
7
tests/01_engine-ftp-userpass.test
Executable file
7
tests/01_engine-ftp-userpass.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ftp_split_userpass bounds an over-long user:pass@ from a hostile ftp:// URL.
|
||||
httrack -O /dev/null -#test=ftp-userpass run | grep -q "ftp-userpass self-test OK"
|
||||
29
tests/01_engine-header.test
Normal file
29
tests/01_engine-header.test
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Response header-line parsing (treathead via -#test=header <raw-line> ...).
|
||||
# Isolates the wire layer from url_savename, which strips traversal on its own.
|
||||
|
||||
# hdr WANT RAW-HEADER-LINE...
# Feeds the raw header lines to "httrack -#test=header" and requires the
# resulting "contenttype=..." output line to equal WANT exactly.
hdr() {
  local want="$1" raw out
  shift
  # Run httrack on its own first: a failing/crashing httrack still aborts the
  # test through `set -e`.
  raw="$(httrack -O /dev/null -#test=header "$@")"
  # `|| true`: a non-matching grep exits 1, which under `set -e` would end the
  # script right here, before the FAIL diagnostic below could be printed.
  out="$(grep '^contenttype=' <<<"$raw" || true)"
  test "$out" == "$want" || {
    echo "FAIL: $* -> '$out' (want '$want')"
    exit 1
  }
}
|
||||
|
||||
hdr 'contenttype=application/pdf cdispo=' 'Content-Type: application/pdf'
|
||||
|
||||
# filename= is honored quoted or bare.
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename="report.pdf"'
|
||||
hdr 'contenttype= cdispo=report.pdf' \
|
||||
'Content-Disposition: attachment; filename=report.pdf'
|
||||
|
||||
# Path components in the filename are dropped on the wire (RFC 2616).
|
||||
hdr 'contenttype= cdispo=evil.pdf' \
|
||||
'Content-Disposition: attachment; filename="../../evil.pdf"'
|
||||
7
tests/01_engine-inplace-escape.test
Executable file
7
tests/01_engine-inplace-escape.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# inplace_escape_*() must match escape_*() on a copy: guards the shared helper.
|
||||
httrack -O /dev/null -#test=inplace-escape run | grep -q "inplace-escape self-test OK"
|
||||
7
tests/01_engine-java.test
Executable file
7
tests/01_engine-java.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# .class constant-pool count is capped to the file size (calloc DoS).
|
||||
httrack -O /dev/null -#test=java run | grep -q "java constant-pool cap self-test OK"
|
||||
12
tests/01_engine-makeindex.test
Executable file
12
tests/01_engine-makeindex.test
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# hts_finish_makeindex writes the footer and gates the refresh meta on a single
|
||||
# first link (guards the macro->function extraction).
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
httrack -O /dev/null -#test=makeindex "$dir" run |
|
||||
grep -q "makeindex self-test OK"
|
||||
9
tests/01_engine-redirect.test
Normal file
9
tests/01_engine-redirect.test
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# #159: a redirect to a same-file alias (http<->https, user@host, ..) must be
|
||||
# followed through, not turned into a self-pointing "moved" stub. The decision
|
||||
# helper is exercised by the engine self-test.
|
||||
httrack -O /dev/null -#test=redirect-samefile run | grep -q "redirect-samefile self-test OK"
|
||||
@@ -3,13 +3,38 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Local save-name extension resolution (url_savename via -#test=savename <fil> <content-type>).
|
||||
# Asserts on the basename of "savename: <path>".
|
||||
# Local save-name resolution (url_savename via -#test=savename <fil> <content-type> [key=value ...]).
|
||||
# name() asserts on the basename, full() on the whole path; prior= registers an
|
||||
# already-crawled link whose sav is rooted under the -O path (/dev/null here).
|
||||
|
||||
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
|
||||
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
|
||||
|
||||
# scratch dir: body= and cached= write temp files (st-savename-body.tmp, hts-cache/)
|
||||
scratch=$(mktemp -d)
|
||||
trap 'rm -rf "$scratch"' EXIT
|
||||
cd "$scratch"
|
||||
|
||||
run() {
|
||||
"$httrack_bin" -O /dev/null -#test=savename "$@" | sed -n 's/^savename: //p'
|
||||
}
|
||||
|
||||
name() {
|
||||
out="$(httrack -O /dev/null -#test=savename "$1" "$2" | sed -n 's/^savename: //p')"
|
||||
test "${out##*/}" == "$3" || {
|
||||
echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "${out##*/}" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
full() {
|
||||
local fil="$1" ctype="$2" want="$3"
|
||||
shift 3
|
||||
out="$(run "$fil" "$ctype" "$@")"
|
||||
test "$out" == "$want" || {
|
||||
echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
@@ -39,3 +64,96 @@ name '/types/data.json' 'application/json' 'data.json'
|
||||
|
||||
# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
|
||||
name '/x.JPG' 'image/jpeg' 'x.JPG'
|
||||
|
||||
# A Content-Disposition filename replaces the URL name outright.
|
||||
name '/x.php' 'application/pdf' 'report.pdf' cdispo=report.pdf
|
||||
name '/download' 'text/html' 'setup.exe' cdispo=setup.exe
|
||||
|
||||
# Reserved characters in a hostile Content-Disposition name are sanitized.
|
||||
name '/x.php' 'application/pdf' 'set_up.exe' 'cdispo=set:up.exe'
|
||||
|
||||
# The md5-of-query suffix lands inside a Content-Disposition name too.
|
||||
name '/x.php?id=1' 'application/pdf' 'report681a.pdf' cdispo=report.pdf
|
||||
|
||||
# Still-downloading path (status=-1): mime drives the ext, cdispo is ignored
|
||||
# there (the deliberately unfolded 4th resolve_extension variant).
|
||||
name '/x.pdf' 'text/html' 'x.html' status=-1
|
||||
name '/x.html' 'text/html' 'x.html' status=-1
|
||||
name '/x.php' 'application/pdf' 'x.pdf' status=-1 cdispo=report.pdf
|
||||
|
||||
# Contested type (wire disagrees with a specific ext): the wire is trusted and
|
||||
# body bytes are not consulted; pinned so a content-based tie-break shows up
|
||||
# as an explicit flip of these rows.
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:FFD8FFE000104A46
|
||||
name '/photo.jpg' 'image/png' 'photo.png' body=hex:89504E470D0A1A0A
|
||||
name '/photo.jpg' 'image/png' 'photo.png'
|
||||
name '/doc.pdf' 'text/html' 'doc.html' body=hex:255044462D312E34
|
||||
name '/doc.pdf' 'text/html' 'doc.html' 'body=<html><body>soft 404</body></html>'
|
||||
name '/style.css' 'image/png' 'style.png' 'body=body { }'
|
||||
|
||||
# A redirect answer resolves nothing: delayed placeholder name.
|
||||
name '/x.php' 'text/html' 'x.0.delayed' statuscode=301
|
||||
|
||||
# Root and query-only URLs get index + the md5-of-query suffix.
|
||||
name '/' 'text/html' 'index.html'
|
||||
name '/?a=1' 'text/html' 'index3872.html'
|
||||
|
||||
# Same URL crawled before: reuse its sav verbatim (case preserved).
|
||||
full '/X.PHP' 'text/html' 'www.example.com/CASE.HTML' \
|
||||
'prior=www.example.com|/X.PHP|www.example.com/CASE.HTML'
|
||||
|
||||
# Another URL owns the name: collision suffix -2, then -3, case-insensitively.
|
||||
name '/x.php' 'text/html' 'x-2.html' \
|
||||
'prior=www.example.com|/other.html|/dev/null/www.example.com/x.html'
|
||||
name '/x.php' 'text/html' 'x-3.html' \
|
||||
'prior=www.example.com|/o1.html|/dev/null/www.example.com/x.html' \
|
||||
'prior=www.example.com|/o2.html|/dev/null/www.example.com/x-2.html'
|
||||
name '/INDEX.HTML' 'text/html' 'INDEX-2.HTML' \
|
||||
'prior=www.example.com|/index.html|/dev/null/www.example.com/index.html'
|
||||
|
||||
# Same basename in another directory is NOT a collision.
|
||||
name '/x.php' 'text/html' 'x.html' \
|
||||
'prior=www.example.com|/sub/x.html|/dev/null/www.example.com/sub/x.html'
|
||||
|
||||
# 8-3 modes: DOS truncates every component to 8+3, ISO9660 level 2 to 31.
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE/DIRECTOR/VERYLONG.HTM' n83=1
|
||||
full '/directory-long/verylongfilename.html' 'text/html' \
|
||||
'/dev/null/EXAMPLE_C/DIRECTORY_LONG/VERYLONGFILENAME.HTM' n83=2
|
||||
name '/verylongfilename.php' 'text/html' 'VERYLO-2.HTM' n83=1 \
|
||||
'prior=www.example.com|/other.html|/dev/null/EXAMPLE/VERYLONG.HTM'
|
||||
|
||||
# urlhack dedup (#271): // collapse and www-strip map to the prior link's sav;
|
||||
# the per-feature negatives opt out and take a fresh name.
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/PRIOR.html' \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' no-slash=1 \
|
||||
'prior=www.example.com|/a/b.php|/dev/null/www.example.com/a/PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/www.example.com/W-PRIOR.html' adr=example.com \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
full '/w.php' 'text/html' '/dev/null/example.com/w.html' adr=example.com no-www=1 \
|
||||
'prior=www.example.com|/w.php|/dev/null/www.example.com/W-PRIOR.html'
|
||||
|
||||
# Distinct URLs must stay distinct under urlhack (no over-normalization).
|
||||
full '/a//b.php' 'text/html' '/dev/null/www.example.com/a/b.html' \
|
||||
'prior=www.example.com|/a/c.php|/dev/null/www.example.com/a/C-PRIOR.html'
|
||||
|
||||
# --strip-query (#112): stripped key dedups onto the prior sav; without the
|
||||
# option the same URLs stay distinct.
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/PAGE-PRIOR.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
'prior=www.example.com|/page.php?id=3|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# A kept key that differs must still block the dedup (no over-stripping).
|
||||
full '/page.php?id=3&sid=42' 'text/html' '/dev/null/www.example.com/page475b.html' \
|
||||
strip=sid 'prior=www.example.com|/page.php?id=4|/dev/null/www.example.com/PAGE-PRIOR.html'
|
||||
|
||||
# Hostile fils stay rooted under the mirror: ../ (raw or %2e-encoded) drops out,
|
||||
# control characters become spaces, oversized names cap at 210 chars (the cap
|
||||
# can chop the extension off entirely).
|
||||
full '/../../etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/%2e%2e/%2e%2e/etc/passwd' 'text/html' '/dev/null/www.example.com///etc/passwd.html'
|
||||
full '/x.php' 'application/pdf' '/dev/null/www.example.com///evil.exe' 'cdispo=../../evil.exe'
|
||||
name $'/evil\rname\t.php' 'text/html' 'evil name .html'
|
||||
name "/$(printf 'a%.0s' {1..300}).php" 'text/html' "$(printf 'a%.0s' {1..210})"
|
||||
|
||||
7
tests/01_engine-unescape-bounds.test
Executable file
7
tests/01_engine-unescape-bounds.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Entity/URL unescapers reserve one byte for the trailing NUL (no 1-byte OOB).
|
||||
httrack -O /dev/null -#test=unescape-bounds run | grep -q "unescape-bounds self-test OK"
|
||||
33
tests/01_zlib-savename-cached.test
Normal file
33
tests/01_zlib-savename-cached.test
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Update-run naming from a real cache entry (-#test=savename cached=<ctype>|<save>).
|
||||
# Named 01_zlib-*: the cache writer needs zlib, which the MSan job can't run.
|
||||
|
||||
# resolve httrack before cd: make check puts a RELATIVE ../src on PATH
|
||||
httrack_bin=$(cd "$(dirname "$(command -v httrack)")" && pwd)/httrack
|
||||
|
||||
scratch=$(mktemp -d)
|
||||
trap 'rm -rf "$scratch"' EXIT
|
||||
cd "$scratch"
|
||||
|
||||
# name FIL CTYPE WANT [key=value ...]
# Runs "-#test=savename" on FIL/CTYPE plus the extra arguments and requires the
# basename of the reported save name to be WANT.
name() {
  local fil="$1"
  local ctype="$2"
  local want="$3"
  shift 3
  out="$("$httrack_bin" -O /dev/null -#test=savename "$fil" "$ctype" "$@" | sed -n 's/^savename: //p')"
  # compare the part after the last '/' only
  if [[ "${out##*/}" != "$want" ]]; then
    echo "FAIL: '$fil' '$ctype' $* -> '$out' (want '$want')"
    exit 1
  fi
}
|
||||
|
||||
# Names are re-derived from the stored headers on every run: neither the
|
||||
# recorded save name nor the cached body bytes change the verdict (pinned).
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.jpg'
|
||||
name '/photo.jpg' 'image/png' 'photo.png' 'cached=image/png|www.example.com/photo.png'
|
||||
name '/photo.jpg' 'image/jpeg' 'photo.jpg' 'cached=image/jpeg|www.example.com/photo.png'
|
||||
name '/style.css' 'image/png' 'style.png' 'cached=image/png|www.example.com/style.css'
|
||||
# agreement keeps the URL ext verbatim (.jpeg), never canonicalized to .jpg
|
||||
name '/photo.jpeg' 'image/jpeg' 'photo.jpeg' 'cached=image/jpeg|www.example.com/photo.jpeg'
|
||||
@@ -15,6 +15,10 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
--found 'types/lie.html' --not-found 'types/lie.png' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/bigtype.png' --not-found 'types/bigtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/page.htm' --not-found 'types/page.html' \
|
||||
--found 'types/script.js' \
|
||||
|
||||
@@ -12,4 +12,7 @@ bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.html' \
|
||||
--found 'types/wrongtype.png' --not-found 'types/wrongtype.jpg' \
|
||||
--found 'types/packed.png' --not-found 'types/packed.jpg' \
|
||||
--found 'types/mutant.png' --not-found 'types/mutant.jpg' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
|
||||
13
tests/30_local-fragment-link.test
Executable file
13
tests/30_local-fragment-link.test
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Issue #279: a link carrying an anchor (target.html#sec, with or without
# quotes) must be fetched without the fragment -- a strict server answers 400
# to a '#' in the request -- while the rewritten local link keeps the fragment
# so the anchor still works.
set -e

top_srcdir="${top_srcdir:-..}"

# Assertions handed to local-crawl.sh, in the order they are audited.
checks=(
  --errors 0
  --found 'fraglink/target.html'
  --file-matches 'fraglink/index.html' 'href=target\.html#sec'
  --file-matches 'fraglink/index.html' 'href="target\.html#sec2"'
)

bash "$top_srcdir/tests/local-crawl.sh" "${checks[@]}" \
  httrack 'BASEURL/fraglink/index.html'
|
||||
23
tests/31_local-javaclass.test
Normal file
23
tests/31_local-javaclass.test
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# The java plugin must load (versioned dlopen name) and parse a .class
# constant pool: a resource named only inside Foo.class gets crawled.
set -e

top_srcdir="${top_srcdir:-..}"

# Throw-away document root, removed again when the script exits.
tmproot=$(mktemp -d)
trap 'rm -rf "$tmproot"' EXIT
site="$tmproot/javaclass"
mkdir "$site"

# Entry page: its only link is the applet class.
printf '%s\n' '<html><body><a href="Foo.class">applet</a></body></html>' \
  >"$site/index.html"

# The resource that is referenced from nowhere but the class constant pool.
printf 'GIF89a' >"$site/hello.gif"

# magic/minor/major, count=2, one CONSTANT_Utf8 "hello.gif", class/superclass
printf '\xCA\xFE\xBA\xBE\x00\x00\x00\x32\x00\x02\x01\x00\x09hello.gif\x00\x00\x00\x00' \
  >"$site/Foo.class"

checks=(
  --root "$tmproot"
  --errors 0
  --found 'javaclass/Foo.class'
  --found 'javaclass/hello.gif'
)

bash "$top_srcdir/tests/local-crawl.sh" "${checks[@]}" \
  httrack 'BASEURL/javaclass/index.html'
|
||||
17
tests/32_local-cdispo.test
Normal file
17
tests/32_local-cdispo.test
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
#
# Content-Disposition decides the saved file name: the attachment filename
# takes the place of the URL-derived one, and a filename that tries to
# traverse directories is cut down to its last component, so it stays inside
# the mirror.

set -euo pipefail

top_srcdir="${top_srcdir:-..}"

# Assertions handed to local-crawl.sh, in the order they are audited.
checks=(
  --errors 0
  --found 'cdispo/report.pdf'
  --file-matches 'cdispo/report.pdf' '%PDF'
  --not-found 'cdispo/fetch.pdf'
  --found 'cdispo/evil.pdf'
  --not-found 'evil.pdf'
)

bash "$top_srcdir/tests/local-crawl.sh" "${checks[@]}" \
  httrack 'BASEURL/cdispo/index.html'
|
||||
20
tests/33_local-delayed.test
Normal file
20
tests/33_local-delayed.test
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#
# Degenerate delayed-type paths (#5/#107 family): a redirect that never
# resolves to a name has to be dropped cleanly. That means no .delayed
# leftovers (local-crawl.sh audits those itself), no "bogus state" cache
# warnings, and links that do resolve must still land under the right name.

set -euo pipefail

top_srcdir="${top_srcdir:-..}"

# Assertions handed to local-crawl.sh, in the order they are audited.
checks=(
  --rerun
  --errors 0
  --found 'delayed/real.pdf'
  --file-matches 'delayed/real.pdf' '%PDF'
  --found 'delayed/notype.bin.html'
  --found 'delayed/empty.html'
  --not-found 'delayed/noloc.html'
  --not-found 'delayed/selfloop.html'
  --not-found 'delayed/chain9.pdf'
  --log-not-found 'bogus state'
)

bash "$top_srcdir/tests/local-crawl.sh" "${checks[@]}" \
  httrack 'BASEURL/delayed/index.html'
|
||||
@@ -6,6 +6,7 @@ EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
local-crawl.sh local-server.py server.crt server.key \
|
||||
server-root/simple/basic.html server-root/simple/link.html \
|
||||
server-root/stripquery/index.html server-root/stripquery/a.html \
|
||||
server-root/fraglink/index.html server-root/fraglink/target.html \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -34,12 +35,20 @@ TESTS = \
|
||||
01_engine-entities.test \
|
||||
01_engine-filelist.test \
|
||||
01_engine-filter.test \
|
||||
01_engine-ftp-line.test \
|
||||
01_engine-ftp-userpass.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-header.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-escape-room.test \
|
||||
01_engine-inplace-escape.test \
|
||||
01_engine-java.test \
|
||||
01_engine-makeindex.test \
|
||||
01_engine-mime.test \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-redirect.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
@@ -49,11 +58,13 @@ TESTS = \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-unescape-bounds.test \
|
||||
01_engine-useragent.test \
|
||||
01_zlib-acceptencoding.test \
|
||||
01_zlib-cache.test \
|
||||
01_zlib-cache-golden.test \
|
||||
01_zlib-cache-writefail.test \
|
||||
01_zlib-savename-cached.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
@@ -80,6 +91,10 @@ TESTS = \
|
||||
26_local-strip-query.test \
|
||||
27_local-cookies-file.test \
|
||||
28_local-pause.test \
|
||||
29_local-redirect-fragment.test
|
||||
29_local-redirect-fragment.test \
|
||||
30_local-fragment-link.test \
|
||||
31_local-javaclass.test \
|
||||
32_local-cdispo.test \
|
||||
33_local-delayed.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -15,8 +15,11 @@
|
||||
# bash local-crawl.sh [--tls] [--root DIR] [--cookie NAME=VALUE ...] \
|
||||
# --errors N --files N --found PATH ... --directory PATH ... \
|
||||
# --log-found REGEX ... --log-not-found REGEX ... \
|
||||
# --file-matches PATH REGEX ... --file-not-matches PATH REGEX ... \
|
||||
# httrack BASEURL/some/path [httrack-args...]
|
||||
# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.
|
||||
# --file-matches/--file-not-matches grep (ERE) a mirrored file (PATH under the
|
||||
# host root), to assert rewritten link/content survived the crawl.
|
||||
# --cookie writes a Netscape cookies.txt (scoped to the discovered host:port,
|
||||
# which the ephemeral port forces into the cookie domain) and passes it to
|
||||
# httrack via --cookies-file, to exercise preloaded cookies.
|
||||
@@ -121,6 +124,10 @@ while test "$pos" -lt "$nargs"; do
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
|
||||
pos=$((pos + 1))
|
||||
;;
|
||||
--file-matches | --file-not-matches)
|
||||
audit+=("${args[$pos]}" "${args[$((pos + 1))]}" "${args[$((pos + 2))]}")
|
||||
pos=$((pos + 2))
|
||||
;;
|
||||
httrack)
|
||||
pos=$((pos + 1))
|
||||
break
|
||||
@@ -239,6 +246,14 @@ done
|
||||
test -n "$hostroot" || die "could not find host root under $out"
|
||||
debug "host root: $hostroot"
|
||||
|
||||
# Issue #107: once the crawl has finished, no *.delayed temporary may remain
# anywhere under the output directory. At most five offenders are listed.
info "checking for leftover .delayed files"
leftovers=$(find "$out" -name '*.delayed' 2>/dev/null | head -5)
if test -n "$leftovers"; then
  result "leftover: $leftovers"
  exit 1
fi
result "OK"
|
||||
|
||||
# --- audit -------------------------------------------------------------------
|
||||
i=0
|
||||
while test "$i" -lt "${#audit[@]}"; do
|
||||
@@ -294,6 +309,24 @@ while test "$i" -lt "${#audit[@]}"; do
|
||||
exit 1
|
||||
else result "OK"; fi
|
||||
;;
|
||||
--file-matches)
  # --file-matches PATH REGEX: the mirrored file PATH (relative to the host
  # root) must contain at least one line matching the ERE REGEX.
  path="${audit[$((i + 1))]}"
  i=$((i + 2))
  info "checking ${path} matches ${audit[$i]}"
  # -a reads binary files as text; -e/-- keep a regex or path that begins
  # with '-' from being parsed as a grep option.
  if grep -aqE -e "${audit[$i]}" -- "${hostroot}/${path}"; then result "OK"; else
    result "no match"
    exit 1
  fi
  ;;
--file-not-matches)
  # --file-not-matches PATH REGEX: the mirrored file PATH must exist and must
  # not contain any line matching the ERE REGEX.
  path="${audit[$((i + 1))]}"
  i=$((i + 2))
  info "checking ${path} lacks ${audit[$i]}"
  # grep fails (status 2) on a missing file just as it does (status 1) on
  # "no match"; without this guard the assertion would pass vacuously when
  # the file was never mirrored.
  if ! test -f "${hostroot}/${path}"; then
    result "missing file"
    exit 1
  fi
  if grep -aqE -e "${audit[$i]}" -- "${hostroot}/${path}"; then
    result "matched"
    exit 1
  else result "OK"; fi
  ;;
|
||||
esac
|
||||
i=$((i + 1))
|
||||
done
|
||||
|
||||
@@ -14,6 +14,7 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import os
|
||||
import time
|
||||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||||
@@ -134,12 +135,14 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
|
||||
# --- type/extension matrix (issue #267 family) -------------------------
|
||||
|
||||
def send_raw(self, body, content_type):
|
||||
def send_raw(self, body, content_type, extra_headers=()):
|
||||
"""Send a raw body with an explicit Content-Type, or none at all when
|
||||
content_type is None (to observe httrack's typeless-file naming)."""
|
||||
self.send_response(200)
|
||||
if content_type is not None:
|
||||
self.send_header("Content-Type", content_type)
|
||||
for name, value in extra_headers:
|
||||
self.send_header(name, value)
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
@@ -148,6 +151,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
# Fake-binary blobs for the image/pdf/typeless cases.
|
||||
FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64
|
||||
FAKE_PDF = b"%PDF-1.4\n" + b"\x00" * 64
|
||||
FAKE_JPEG = b"\xff\xd8\xff\xe0" + b"\x00" * 64
|
||||
BIG_JPEG = b"\xff\xd8\xff\xe0" + bytes(range(256)) * 64 # > sniff window
|
||||
|
||||
# path -> (body, content_type); None sends no header, "" sends an empty
|
||||
# Content-Type value (no usable type, must be treated like None).
|
||||
@@ -159,6 +164,8 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/notype.pdf": (FAKE_PDF, None),
|
||||
"/types/emptyct.png": (FAKE_PNG, ""),
|
||||
"/types/lie.png": (FAKE_PNG, "text/html"),
|
||||
"/types/wrongtype.jpg": (FAKE_JPEG, "image/png"),
|
||||
"/types/bigtype.jpg": (BIG_JPEG, "image/png"),
|
||||
"/types/report.pdf": (b"<html><body>real page</body></html>", "text/html"),
|
||||
"/types/page.htm": (b"<html><body>htm page</body></html>", "text/html"),
|
||||
"/types/script.js": (b"var x = 1;\n", "application/javascript"),
|
||||
@@ -176,6 +183,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
'\t<a href="notype.pdf">notypepdf</a>\n'
|
||||
'\t<img src="emptyct.png" />\n'
|
||||
'\t<img src="lie.png" />\n'
|
||||
'\t<img src="wrongtype.jpg" />\n'
|
||||
'\t<img src="bigtype.jpg" />\n'
|
||||
'\t<img src="mutant.jpg" />\n'
|
||||
'\t<img src="packed.jpg" />\n'
|
||||
'\t<a href="report.pdf">report</a>\n'
|
||||
'\t<a href="page.htm">htm</a>\n'
|
||||
'\t<script src="script.js"></script>\n'
|
||||
@@ -190,6 +201,25 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# content changes between crawls: run 1 sniffs JPEG, the update pass must
|
||||
# keep the run-1 name (recorded verdict) even though the body is now PNG
|
||||
MUTANT_SEEN = set()
|
||||
|
||||
def route_types_mutant(self):
    """Serve a body whose magic bytes depend on whether the path was hit before.

    A path not yet recorded in MUTANT_SEEN gets the fake JPEG; once recorded
    it gets the fake PNG. Only non-HEAD requests record the path. The
    declared Content-Type is "image/png" in both cases.
    """
    request_path = urlsplit(self.path).path
    if request_path in self.MUTANT_SEEN:
        payload = self.FAKE_PNG
    else:
        payload = self.FAKE_JPEG
    # A HEAD probe must not flip the state.
    if self.command != "HEAD":
        self.MUTANT_SEEN.add(request_path)
    self.send_raw(payload, "image/png")
|
||||
|
||||
# gzip on the wire: the sniff must see the decoded body, not the stream
|
||||
def route_types_packed(self):
    """Send the fake JPEG gzip-compressed, declared as image/png.

    The response carries Content-Encoding: gzip, so the bytes on the wire
    are the compressed stream rather than the JPEG itself.
    """
    compressed = gzip.compress(self.FAKE_JPEG)
    encoding_header = ("Content-Encoding", "gzip")
    self.send_raw(compressed, "image/png", extra_headers=[encoding_header])
|
||||
|
||||
# --- MIME-type exclusion abort (issue #58) -----------------------------
|
||||
# A -mime:application/pdf filter must abort the transfer once the header
|
||||
# arrives, not download the whole body and discard it.
|
||||
@@ -354,6 +384,27 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
|
||||
# Content-Disposition naming: the attachment filename replaces the
|
||||
# URL-derived name; path components in it are stripped (RFC 2616).
|
||||
CDISPO_NAMES = {
|
||||
"/cdispo/fetch.php": "report.pdf",
|
||||
"/cdispo/evil.php": "../../evil.pdf",
|
||||
}
|
||||
|
||||
def route_cdispo_index(self):
    """Index page linking the two Content-Disposition fixtures."""
    links = (
        '\t<a href="fetch.php">report</a>\n',
        '\t<a href="evil.php">evil</a>\n',
    )
    self.send_html("".join(links))
|
||||
|
||||
def route_cdispo(self):
    """Send the fake PDF as an attachment named per CDISPO_NAMES.

    The request path (query string excluded) selects the filename, which is
    placed verbatim inside the quoted filename parameter of the
    Content-Disposition header. A path missing from CDISPO_NAMES raises
    KeyError.
    """
    request_path = urlsplit(self.path).path
    attachment_name = self.CDISPO_NAMES[request_path]
    header = ("Content-Disposition", 'attachment; filename="%s"' % attachment_name)
    self.send_raw(self.FAKE_PDF, "application/pdf", extra_headers=[header])
|
||||
|
||||
# 302 whose Location carries a #fragment (#204): the fragment is a UA anchor
|
||||
# that must be dropped before the target is fetched. A leaked '#' reaches the
|
||||
# strict-server guard below and 400s.
|
||||
@@ -369,6 +420,50 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
def route_redir_target(self):
|
||||
self.send_raw(b"<html><body>redirect target</body></html>\n", "text/html")
|
||||
|
||||
# --- delayed-type degenerate paths (issues #5/#107) --------------------
|
||||
def route_delayed_index(self):
    """Index page linking every delayed-type fixture, one anchor per line."""
    links = [
        '\t<a href="noloc.php">noloc</a>\n',
        '\t<a href="selfloop.php">selfloop</a>\n',
        '\t<a href="chain1.php">chain</a>\n',
        '\t<a href="redir.php">redir</a>\n',
        '\t<a href="notype.bin">notype</a>\n',
        '\t<a href="empty.php">empty</a>\n',
    ]
    self.send_html("".join(links))
|
||||
|
||||
def send_redirect(self, location):
    """Send a body-less "302 Found".

    location -- value for the Location header, or None to omit the header
                altogether.
    Content-Length: 0 is always sent, after Location when there is one.
    """
    headers = []
    if location is not None:
        headers.append(("Location", location))
    headers.append(("Content-Length", "0"))

    self.send_response(302, "Found")
    for header_name, header_value in headers:
        self.send_header(header_name, header_value)
    self.end_headers()
|
||||
|
||||
def route_delayed_noloc(self):
    """Answer 302 with no Location header, so the name never resolves."""
    missing_location = None
    self.send_redirect(missing_location)
|
||||
|
||||
def route_delayed_selfloop(self):
    """Redirect to the relative target "selfloop.php"."""
    target = "selfloop.php"
    self.send_redirect(target)
|
||||
|
||||
def route_delayed_chain(self):
    """One hop of the chainN.php redirect chain.

    The hop number N is read from the request path (the digits between the
    last "chain" and the following "."). Hops below 9 redirect to hop N + 1;
    hop 9 and above serve the fake PDF. Per the fixture's design the chain
    chain1..chain9 is one hop longer than the type-check redirect budget.
    """
    request_path = urlsplit(self.path).path
    after_chain = request_path.rsplit("chain", 1)[1]  # e.g. "3.php"
    hop = int(after_chain.split(".")[0])
    if hop >= 9:
        self.send_raw(self.FAKE_PDF, "application/pdf")
        return
    self.send_redirect("chain%d.php" % (hop + 1))
|
||||
|
||||
def route_delayed_redir(self):
    """Redirect to the relative target "real.pdf"."""
    target = "real.pdf"
    self.send_redirect(target)
|
||||
|
||||
def route_delayed_realpdf(self):
    """Serve the fake PDF with Content-Type application/pdf."""
    payload = self.FAKE_PDF
    self.send_raw(payload, "application/pdf")
|
||||
|
||||
def route_delayed_notype(self):
    """Serve the fake PDF, passing None as the content type to send_raw."""
    payload = self.FAKE_PDF
    self.send_raw(payload, None)
|
||||
|
||||
def route_delayed_empty(self):
    """Serve a zero-byte text/html body (200 + Content-Length: 0)."""
    nothing = b""
    self.send_raw(nothing, "text/html")
|
||||
|
||||
ROUTES = {
|
||||
"/cookies/entrance.php": route_entrance,
|
||||
"/cookies/second.php": route_second,
|
||||
@@ -384,6 +479,10 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/types/notype.pdf": route_types,
|
||||
"/types/emptyct.png": route_types,
|
||||
"/types/lie.png": route_types,
|
||||
"/types/wrongtype.jpg": route_types,
|
||||
"/types/bigtype.jpg": route_types,
|
||||
"/types/mutant.jpg": route_types_mutant,
|
||||
"/types/packed.jpg": route_types_packed,
|
||||
"/types/report.pdf": route_types,
|
||||
"/types/page.htm": route_types,
|
||||
"/types/script.js": route_types,
|
||||
@@ -406,6 +505,25 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/mimex/index.html": route_mimex_index,
|
||||
"/mimex/blob.pdf": route_mimex_blob,
|
||||
"/mimex/real.html": route_mimex_real,
|
||||
"/cdispo/index.html": route_cdispo_index,
|
||||
"/cdispo/fetch.php": route_cdispo,
|
||||
"/cdispo/evil.php": route_cdispo,
|
||||
"/delayed/index.html": route_delayed_index,
|
||||
"/delayed/noloc.php": route_delayed_noloc,
|
||||
"/delayed/selfloop.php": route_delayed_selfloop,
|
||||
"/delayed/redir.php": route_delayed_redir,
|
||||
"/delayed/real.pdf": route_delayed_realpdf,
|
||||
"/delayed/notype.bin": route_delayed_notype,
|
||||
"/delayed/empty.php": route_delayed_empty,
|
||||
"/delayed/chain1.php": route_delayed_chain,
|
||||
"/delayed/chain2.php": route_delayed_chain,
|
||||
"/delayed/chain3.php": route_delayed_chain,
|
||||
"/delayed/chain4.php": route_delayed_chain,
|
||||
"/delayed/chain5.php": route_delayed_chain,
|
||||
"/delayed/chain6.php": route_delayed_chain,
|
||||
"/delayed/chain7.php": route_delayed_chain,
|
||||
"/delayed/chain8.php": route_delayed_chain,
|
||||
"/delayed/chain9.php": route_delayed_chain,
|
||||
"/redir/index.html": route_redir_index,
|
||||
"/redir/go.php": route_redir_go,
|
||||
"/redir/target.html": route_redir_target,
|
||||
|
||||
4
tests/server-root/fraglink/index.html
Normal file
4
tests/server-root/fraglink/index.html
Normal file
@@ -0,0 +1,4 @@
|
||||
<html><body>
|
||||
<a href=target.html#sec>unquoted fragment link</a>
|
||||
<a href="target.html#sec2">quoted fragment link</a>
|
||||
</body></html>
|
||||
1
tests/server-root/fraglink/target.html
Normal file
1
tests/server-root/fraglink/target.html
Normal file
@@ -0,0 +1 @@
|
||||
<html><body><a name="sec"></a><a name="sec2"></a>target</body></html>
|
||||
Reference in New Issue
Block a user