mirror of
https://github.com/xroche/httrack.git
synced 2026-06-20 00:58:47 +03:00
Compare commits
36 Commits
docs/api-h
...
fix/css-ur
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
42acbe6c97 | ||
|
|
cae11499f1 | ||
|
|
02c7f4ebf6 | ||
|
|
9070b44a70 | ||
|
|
799c045061 | ||
|
|
fb1ee3bf2e | ||
|
|
6a08ca7d39 | ||
|
|
a8b491e509 | ||
|
|
a8e4bb3b81 | ||
|
|
0145ec37a3 | ||
|
|
a80fab38ba | ||
|
|
c52a524a63 | ||
|
|
1907621d37 | ||
|
|
3b2d7afdaa | ||
|
|
6ee539619e | ||
|
|
fb098b27b4 | ||
|
|
5f6a3fb917 | ||
|
|
f9e676dbe3 | ||
|
|
1b440c44b5 | ||
|
|
ac6dd1a570 | ||
|
|
4549ec3695 | ||
|
|
ac56c31b24 | ||
|
|
ee6beeeb7d | ||
|
|
6788bda380 | ||
|
|
7ead8d595e | ||
|
|
93f502990c | ||
|
|
0f4b2596b2 | ||
|
|
4a676bb5e1 | ||
|
|
36b4e834b8 | ||
|
|
bbb423f025 | ||
|
|
eed46e0b09 | ||
|
|
fa57f0148f | ||
|
|
76260d5e6e | ||
|
|
5d0913dfce | ||
|
|
9b7601a987 | ||
|
|
4ec38c4e66 |
@@ -2532,8 +2532,26 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
#if HTS_USEOPENSSL
|
||||
/* SSL mode */
|
||||
if (back[i].r.ssl) {
|
||||
int tunnel_ok = 1;
|
||||
|
||||
// https via proxy: CONNECT-tunnel before TLS (#85)
|
||||
if (back[i].r.req.proxy.active && back[i].r.ssl_con == NULL) {
|
||||
const int timeout = back[i].timeout > 0 ? back[i].timeout : 30;
|
||||
|
||||
tunnel_ok =
|
||||
http_proxy_tunnel(opt, &back[i].r, back[i].url_adr, timeout);
|
||||
if (!tunnel_ok) {
|
||||
if (!strnotempty(back[i].r.msg))
|
||||
strcpybuff(back[i].r.msg, "proxy CONNECT failed");
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
back[i].r.statuscode = STATUSCODE_NON_FATAL;
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
}
|
||||
}
|
||||
// handshake not yet launched
|
||||
if (!back[i].r.ssl_con) {
|
||||
if (tunnel_ok && !back[i].r.ssl_con) {
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_ALL);
|
||||
// new session
|
||||
back[i].r.ssl_con = SSL_new(openssl_ctx);
|
||||
@@ -2551,7 +2569,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
back[i].r.statuscode = STATUSCODE_SSL_HANDSHAKE;
|
||||
}
|
||||
/* Error */
|
||||
if (back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||
if (tunnel_ok && back[i].r.statuscode == STATUSCODE_SSL_HANDSHAKE) {
|
||||
strcpybuff(back[i].r.msg, "bad SSL/TLS handshake");
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
@@ -2779,7 +2797,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
if (strcmp(back[i].url_fil, "/robots.txt")) {
|
||||
if (back[i].r.statuscode == HTTP_OK) { // 'OK'
|
||||
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_fil)) { // pas HTML
|
||||
if (opt->getmode & 2) { // on peut ecrire des non html
|
||||
if (opt->getmode & HTS_GETMODE_NONHTML) {
|
||||
int fcheck = 0;
|
||||
int last_errno = 0;
|
||||
|
||||
@@ -2852,7 +2870,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { // on coupe tout!
|
||||
} else { // on coupe tout!
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"File cancelled (non HTML): %s%s",
|
||||
back[i].url_adr, back[i].url_fil);
|
||||
@@ -3661,7 +3679,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
#endif
|
||||
if (sz >= 0) {
|
||||
if (!is_hypertext_mime(opt, back[i].r.contenttype, back[i].url_sav)) { // pas HTML
|
||||
if (opt->getmode & 2) { // on peut ecrire des non html **sinon ben euhh sera intercepté plus loin, donc rap sur ce qui va sortir**
|
||||
if (opt->getmode & HTS_GETMODE_NONHTML) {
|
||||
filenote(&opt->state.strc, back[i].url_sav, NULL); // noter fichier comme connu
|
||||
file_notify(opt, back[i].url_adr, back[i].url_fil,
|
||||
back[i].url_sav, 0, 1,
|
||||
@@ -3838,7 +3856,7 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
/* funny log for commandline users */
|
||||
//if (!opt->quiet) {
|
||||
// petite animation
|
||||
if (opt->verbosedisplay == 1) {
|
||||
if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||
if (back[i].status == STATUS_READY) {
|
||||
if (back[i].r.statuscode == HTTP_OK)
|
||||
printf("* %s%s (" LLintP " bytes) - OK" VT_CLREOL "\r",
|
||||
|
||||
@@ -370,7 +370,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
StringCopy(opt->path_html, base);
|
||||
StringCopy(opt->path_html_utf8, base);
|
||||
}
|
||||
opt->cache = 1;
|
||||
opt->cache = HTS_CACHE_PRIORITY;
|
||||
|
||||
/* pass 1: create everything in a single write session */
|
||||
selftest_open_for_write(&cache, opt);
|
||||
@@ -547,7 +547,7 @@ static void golden_setup(httrackp *opt, const char *dir) {
|
||||
StringCopy(opt->path_log, base);
|
||||
StringCopy(opt->path_html, base);
|
||||
StringCopy(opt->path_html_utf8, base);
|
||||
opt->cache = 1;
|
||||
opt->cache = HTS_CACHE_PRIORITY;
|
||||
}
|
||||
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen) {
|
||||
|
||||
@@ -135,7 +135,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, /* 128 bytes */ char *adr) {
|
||||
// returns 0 if error
|
||||
// url: buffer where URL must be stored - or ip:port in case of failure
|
||||
// data: 32Kb
|
||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data) {
|
||||
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
char *data) {
|
||||
int retour = 0;
|
||||
|
||||
// connexion (accept)
|
||||
|
||||
@@ -1835,9 +1835,10 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
a++; // sauter espace(s)
|
||||
if (strnotempty(a)) {
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
if (strcmp(a, "/") != 0 || opt->robots >= 3)
|
||||
if (strcmp(a, "/") != 0 ||
|
||||
opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
#endif
|
||||
{ /* ignoring disallow: / */
|
||||
{ /* ignoring disallow: / */
|
||||
if ((strlen(buff) + strlen(a) + 8) < sizeof(buff)) {
|
||||
strcatbuff(buff, a);
|
||||
strcatbuff(buff, "\n");
|
||||
@@ -1932,10 +1933,10 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
"Warning: store %s without scan: %s", r.contenttype,
|
||||
savename());
|
||||
} else {
|
||||
if ((opt->getmode & 2) != 0) { // ok autorisé
|
||||
if ((opt->getmode & HTS_GETMODE_NONHTML) != 0) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Store %s: %s", r.contenttype,
|
||||
savename());
|
||||
} else { // lien non autorisé! (ex: cgi-bin en html)
|
||||
} else { // lien non autorisé! (ex: cgi-bin en html)
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"non-html file ignored after upload at %s : %s",
|
||||
urladr(), urlfil());
|
||||
@@ -2052,7 +2053,7 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
ptr++;
|
||||
|
||||
// faut-il sauter le(s) lien(s) suivant(s)? (fichiers images à passer après les html)
|
||||
if (opt->getmode & 4) { // sauver les non html après
|
||||
if (opt->getmode & HTS_GETMODE_HTML_FIRST) {
|
||||
// sauter les fichiers selon la passe
|
||||
if (!numero_passe) {
|
||||
while((ptr < opt->lien_tot) ? (heap(ptr)->pass2) : 0)
|
||||
@@ -2584,7 +2585,7 @@ static int mkdir_compat(const char *pathname) {
|
||||
|
||||
/* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */
|
||||
/* Note: preserve errno */
|
||||
HTSEXT_API int dir_exists(const char *path) {
|
||||
HTSEXT_API hts_boolean dir_exists(const char *path) {
|
||||
const int err = errno;
|
||||
STRUCT_STAT st;
|
||||
char BIGSTK file[HTS_URLMAXSIZE * 2];
|
||||
@@ -3341,7 +3342,8 @@ int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
int ptr, int numero_passe) {
|
||||
int n = back_pluggable_sockets(sback, opt);
|
||||
|
||||
if (opt->savename_delayed == 2 && !opt->delayed_cached) /* cancel (always delayed) */
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD &&
|
||||
!opt->delayed_cached) /* cancel (always delayed) */
|
||||
return 0;
|
||||
if (n > 0) {
|
||||
int p;
|
||||
@@ -3645,7 +3647,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int p) {
|
||||
}
|
||||
|
||||
// ask for termination
|
||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
||||
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force) {
|
||||
if (opt != NULL) {
|
||||
hts_log_print(opt, LOG_ERROR, "Exit requested by shell or user");
|
||||
hts_mutexlock(&opt->state.lock);
|
||||
@@ -3655,7 +3657,7 @@ HTSEXT_API int hts_request_stop(httrackp * opt, int force) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
||||
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt) {
|
||||
int ended;
|
||||
hts_mutexlock(&opt->state.lock);
|
||||
ended = opt->state.is_ended;
|
||||
@@ -3677,12 +3679,12 @@ HTSEXT_API int hts_has_stopped(httrackp * opt) {
|
||||
//}
|
||||
// ajout d'URL
|
||||
// -1 : erreur
|
||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url) {
|
||||
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url) {
|
||||
if (url)
|
||||
opt->state._hts_addurl = url;
|
||||
return (opt->state._hts_addurl != NULL);
|
||||
}
|
||||
HTSEXT_API int hts_resetaddurl(httrackp * opt) {
|
||||
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt) {
|
||||
opt->state._hts_addurl = NULL;
|
||||
return (opt->state._hts_addurl != NULL);
|
||||
}
|
||||
@@ -3701,7 +3703,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->maxsoc > 0)
|
||||
to->maxsoc = from->maxsoc;
|
||||
|
||||
if (from->nearlink > -1)
|
||||
/* hts_boolean/enum fields are unsigned (GCC), so a bare `> -1` unset-guard
|
||||
is always false; cast to int to keep the -1 "unset" sentinel test. */
|
||||
if ((int) from->nearlink > -1)
|
||||
to->nearlink = from->nearlink;
|
||||
|
||||
if (from->timeout > -1)
|
||||
@@ -3728,18 +3732,18 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->hostcontrol > -1)
|
||||
to->hostcontrol = from->hostcontrol;
|
||||
|
||||
if (from->errpage > -1)
|
||||
if ((int) from->errpage > -1)
|
||||
to->errpage = from->errpage;
|
||||
|
||||
if (from->parseall > -1)
|
||||
if ((int) from->parseall > -1)
|
||||
to->parseall = from->parseall;
|
||||
|
||||
// test all: bit 8 de travel
|
||||
if (from->travel > -1) {
|
||||
if (from->travel & 256)
|
||||
to->travel |= 256;
|
||||
if (from->travel & HTS_TRAVEL_TEST_ALL)
|
||||
to->travel |= HTS_TRAVEL_TEST_ALL;
|
||||
else
|
||||
to->travel &= 255;
|
||||
to->travel &= HTS_TRAVEL_SCOPE_MASK;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -3843,7 +3847,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
|
||||
a = opt->savename_type;
|
||||
b = opt->savename_83;
|
||||
opt->savename_type = 0;
|
||||
opt->savename_83 = 0;
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||
// note: adr,fil peuvent être patchés
|
||||
r =
|
||||
url_savename(&afs, NULL, NULL, NULL, opt, sback, cache, hashptr, ptr, numero_passe,
|
||||
|
||||
@@ -369,10 +369,6 @@ char *readfile_or(const char *fil, const char *defaultdata);
|
||||
void check_rate(TStamp stat_timestart, int maxrate);
|
||||
#endif
|
||||
|
||||
// links
|
||||
int liens_record(char *adr, char *fil, char *save, char *former_adr,
|
||||
char *former_fil, char *codebase);
|
||||
|
||||
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
||||
Not thread-safe; call from the single crawl loop. */
|
||||
|
||||
|
||||
@@ -612,12 +612,12 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
/* Terminal is a tty, may ask questions and display funny information */
|
||||
if (isatty(1)) {
|
||||
opt->quiet = 0;
|
||||
opt->verbosedisplay = 1;
|
||||
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||
}
|
||||
/* Not a tty, no stdin input or funny output! */
|
||||
else {
|
||||
opt->quiet = 1;
|
||||
opt->verbosedisplay = 0;
|
||||
opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -953,9 +953,11 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
p = buff;
|
||||
do {
|
||||
int insert_after_argc;
|
||||
int quoted; /* "" unquotes to empty but is still a real token (#106) */
|
||||
|
||||
// read next
|
||||
lastp = p;
|
||||
quoted = (p != NULL && *p == '"');
|
||||
if (p) {
|
||||
p = next_token(p, 1);
|
||||
if (p) {
|
||||
@@ -966,7 +968,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
|
||||
/* Insert parameters BUT so that they can be in the same order */
|
||||
if (lastp) {
|
||||
if (strnotempty(lastp)) {
|
||||
if (strnotempty(lastp) || quoted) {
|
||||
insert_after_argc = argc - insert_after;
|
||||
cmdl_ins(lastp, insert_after_argc, (argv + insert_after), x_argvblk,
|
||||
x_argvblk_size, x_ptr);
|
||||
@@ -1431,7 +1433,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
StringBuff(opt->path_log), "hts-in_progress.lock"))) { // fichier lock?
|
||||
//char s[32];
|
||||
|
||||
opt->cache = 1; // cache prioritaire
|
||||
opt->cache = HTS_CACHE_PRIORITY; // cache prioritaire
|
||||
if (opt->quiet == 0) {
|
||||
if ((fexist
|
||||
(fconcat
|
||||
@@ -1465,7 +1467,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
(fconcat
|
||||
(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html), "index.html"))) {
|
||||
//char s[32];
|
||||
opt->cache = 2; // cache vient après test de validité
|
||||
opt->cache = HTS_CACHE_TEST_UPDATE;
|
||||
if (opt->quiet == 0) {
|
||||
if ((fexist
|
||||
(fconcat
|
||||
@@ -1558,25 +1560,25 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
return 0; // déja fait normalement
|
||||
//
|
||||
case 'g': // récupérer un (ou plusieurs) fichiers isolés
|
||||
opt->wizard = 2; // le wizard on peut plus s'en passer..
|
||||
opt->wizard = HTS_WIZARD_AUTO;
|
||||
//opt->wizard=0; // pas de wizard
|
||||
opt->cache = 0; // ni de cache
|
||||
opt->cache = HTS_CACHE_NONE; // ni de cache
|
||||
opt->makeindex = 0; // ni d'index
|
||||
httrack_logmode = 1; // erreurs à l'écran
|
||||
opt->savename_type = 1003; // mettre dans le répertoire courant
|
||||
opt->depth = 0; // ne pas explorer la page
|
||||
opt->accept_cookie = 0; // pas de cookies
|
||||
opt->robots = 0; // pas de robots
|
||||
opt->robots = HTS_ROBOTS_NEVER; // pas de robots
|
||||
break;
|
||||
case 'w':
|
||||
opt->wizard = 2; // wizard 'soft' (ne pose pas de questions)
|
||||
opt->travel = 0;
|
||||
opt->seeker = 1;
|
||||
opt->wizard = HTS_WIZARD_AUTO;
|
||||
opt->travel = HTS_TRAVEL_SAME_ADDRESS;
|
||||
opt->seeker = HTS_SEEKER_DOWN;
|
||||
break;
|
||||
case 'W':
|
||||
opt->wizard = 1; // Wizard-Help (pose des questions)
|
||||
opt->travel = 0;
|
||||
opt->seeker = 1;
|
||||
opt->wizard = HTS_WIZARD_ASK; // Wizard-Help (pose des questions)
|
||||
opt->travel = HTS_TRAVEL_SAME_ADDRESS;
|
||||
opt->seeker = HTS_SEEKER_DOWN;
|
||||
break;
|
||||
case 'r': // n'est plus le recurse get bestial mais wizard itou!
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
@@ -1598,19 +1600,23 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
// note: les tests opt->depth sont pour éviter de faire
|
||||
// un miroir du web (:-O) accidentelement ;-)
|
||||
case 'a': /*if (opt->depth==9999) opt->depth=3; */
|
||||
opt->travel = 0 + (opt->travel & 256);
|
||||
opt->travel =
|
||||
HTS_TRAVEL_SAME_ADDRESS + (opt->travel & HTS_TRAVEL_TEST_ALL);
|
||||
break;
|
||||
case 'd': /*if (opt->depth==9999) opt->depth=3; */
|
||||
opt->travel = 1 + (opt->travel & 256);
|
||||
opt->travel =
|
||||
HTS_TRAVEL_SAME_DOMAIN + (opt->travel & HTS_TRAVEL_TEST_ALL);
|
||||
break;
|
||||
case 'l': /*if (opt->depth==9999) opt->depth=3; */
|
||||
opt->travel = 2 + (opt->travel & 256);
|
||||
opt->travel =
|
||||
HTS_TRAVEL_SAME_TLD + (opt->travel & HTS_TRAVEL_TEST_ALL);
|
||||
break;
|
||||
case 'e': /*if (opt->depth==9999) opt->depth=3; */
|
||||
opt->travel = 7 + (opt->travel & 256);
|
||||
opt->travel =
|
||||
HTS_TRAVEL_EVERYWHERE + (opt->travel & HTS_TRAVEL_TEST_ALL);
|
||||
break;
|
||||
case 't':
|
||||
opt->travel |= 256;
|
||||
opt->travel |= HTS_TRAVEL_TEST_ALL;
|
||||
break;
|
||||
case 'n':
|
||||
opt->nearlink = 1;
|
||||
@@ -1620,16 +1626,16 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
break;
|
||||
//
|
||||
case 'U':
|
||||
opt->seeker = 2;
|
||||
opt->seeker = HTS_SEEKER_UP;
|
||||
break;
|
||||
case 'D':
|
||||
opt->seeker = 1;
|
||||
opt->seeker = HTS_SEEKER_DOWN;
|
||||
break;
|
||||
case 'S':
|
||||
opt->seeker = 0;
|
||||
break;
|
||||
case 'B':
|
||||
opt->seeker = 3;
|
||||
opt->seeker = HTS_SEEKER_DOWN | HTS_SEEKER_UP;
|
||||
break;
|
||||
//
|
||||
case 'Y':
|
||||
@@ -1659,12 +1665,12 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
//case 'A': opt->urlmode=1; break;
|
||||
//case 'R': opt->urlmode=2; break;
|
||||
case 'K':
|
||||
opt->urlmode = 0;
|
||||
opt->urlmode = HTS_URLMODE_ABSOLUTE;
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->urlmode);
|
||||
if (opt->urlmode == 0) { // in fact K0 ==> K2
|
||||
sscanf(com + 1, "%d", (int *) &opt->urlmode);
|
||||
if (opt->urlmode == HTS_URLMODE_ABSOLUTE) { // in fact K0 ==> K2
|
||||
// and K ==> K0
|
||||
opt->urlmode = 2;
|
||||
opt->urlmode = HTS_URLMODE_RELATIVE;
|
||||
}
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
@@ -1779,7 +1785,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
break;
|
||||
//
|
||||
case 'b':
|
||||
sscanf(com + 1, "%d", &opt->accept_cookie);
|
||||
sscanf(com + 1, "%d", (int *) &opt->accept_cookie);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
break;
|
||||
@@ -1811,53 +1817,51 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
com++;
|
||||
}
|
||||
break;
|
||||
case 'L':
|
||||
{
|
||||
sscanf(com + 1, "%d", &opt->savename_83);
|
||||
switch (opt->savename_83) {
|
||||
case 0: // 8-3 (ISO9660 L1)
|
||||
opt->savename_83 = 1;
|
||||
break;
|
||||
case 1:
|
||||
opt->savename_83 = 0;
|
||||
break;
|
||||
default: // 2 == ISO9660 (ISO9660 L2)
|
||||
opt->savename_83 = 2;
|
||||
break;
|
||||
}
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
case 'L': {
|
||||
sscanf(com + 1, "%d", (int *) &opt->savename_83);
|
||||
switch (opt->savename_83) {
|
||||
case 0: // 8-3 (ISO9660 L1)
|
||||
opt->savename_83 = HTS_SAVENAME_83_DOS;
|
||||
break;
|
||||
case 1:
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG;
|
||||
break;
|
||||
default: // 2 == ISO9660 (ISO9660 L2)
|
||||
opt->savename_83 = HTS_SAVENAME_83_ISO9660;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
while (isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
} break;
|
||||
case 's':
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->robots);
|
||||
sscanf(com + 1, "%d", (int *) &opt->robots);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
} else
|
||||
opt->robots = 1;
|
||||
opt->robots = HTS_ROBOTS_SOMETIMES;
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt mode set to %d\n", opt->robots);
|
||||
#endif
|
||||
break;
|
||||
case 'o':
|
||||
sscanf(com + 1, "%d", &opt->errpage);
|
||||
sscanf(com + 1, "%d", (int *) &opt->errpage);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
break;
|
||||
case 'u':
|
||||
sscanf(com + 1, "%d", &opt->check_type);
|
||||
sscanf(com + 1, "%d", (int *) &opt->check_type);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
break;
|
||||
//
|
||||
case 'C':
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->cache);
|
||||
sscanf(com + 1, "%d", (int *) &opt->cache);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
} else
|
||||
opt->cache = 1;
|
||||
opt->cache = HTS_CACHE_PRIORITY;
|
||||
break;
|
||||
case 'k':
|
||||
opt->all_in_cache = 1;
|
||||
@@ -1913,7 +1917,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
case 'I':
|
||||
opt->kindex = 1;
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->kindex);
|
||||
sscanf(com + 1, "%d", (int *) &opt->kindex);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
}
|
||||
@@ -1985,9 +1989,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
break; // url hack
|
||||
case 'v':
|
||||
opt->verbosedisplay = 2;
|
||||
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->verbosedisplay);
|
||||
sscanf(com + 1, "%d", (int *) &opt->verbosedisplay);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
}
|
||||
@@ -2000,9 +2004,9 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
}
|
||||
break;
|
||||
case 'N':
|
||||
opt->savename_delayed = 2;
|
||||
opt->savename_delayed = HTS_SAVENAME_DELAYED_HARD;
|
||||
if (isdigit((unsigned char) *(com + 1))) {
|
||||
sscanf(com + 1, "%d", &opt->savename_delayed);
|
||||
sscanf(com + 1, "%d", (int *) &opt->savename_delayed);
|
||||
while(isdigit((unsigned char) *(com + 1)))
|
||||
com++;
|
||||
}
|
||||
@@ -2045,7 +2049,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
// preserve: no footer, original links
|
||||
case 'p':
|
||||
StringClear(opt->footer);
|
||||
opt->urlmode = 4;
|
||||
opt->urlmode = HTS_URLMODE_KEEP_ORIGINAL;
|
||||
break;
|
||||
case 'L': // URL list
|
||||
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
|
||||
@@ -3092,6 +3096,78 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
htsmain_free();
|
||||
return 0;
|
||||
break;
|
||||
case '9': { // copy_htsopt selftest: httrack -#9
|
||||
httrackp *from = hts_create_opt();
|
||||
httrackp *to = hts_create_opt();
|
||||
int err = 0;
|
||||
|
||||
/* from-values differ from both the to-values and the
|
||||
hts_create_opt() defaults (nearlink FALSE, errpage/parseall
|
||||
TRUE), so a copy that no-ops or just resets to defaults is
|
||||
caught too, not only the unsigned-guard bug. */
|
||||
from->retry = 7; /* int field: positive control */
|
||||
to->retry = 0;
|
||||
from->nearlink = HTS_TRUE;
|
||||
to->nearlink = HTS_FALSE;
|
||||
from->errpage = HTS_FALSE;
|
||||
to->errpage = HTS_TRUE;
|
||||
from->parseall = HTS_FALSE;
|
||||
to->parseall = HTS_TRUE;
|
||||
|
||||
copy_htsopt(from, to);
|
||||
|
||||
if (to->retry != 7)
|
||||
err = 1;
|
||||
if (to->nearlink != HTS_TRUE)
|
||||
err = 1;
|
||||
if (to->errpage != HTS_FALSE)
|
||||
err = 1;
|
||||
if (to->parseall != HTS_FALSE)
|
||||
err = 1;
|
||||
|
||||
hts_free_opt(from);
|
||||
hts_free_opt(to);
|
||||
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} break;
|
||||
case 'Q': { // cookie request-header selftest: httrack -#Q
|
||||
static t_cookie cookie;
|
||||
char hdr[1024];
|
||||
/* RFC 6265: bare name=value pairs, no $Version/$Path (#151). */
|
||||
const char *expected = "Cookie: name=value; has_js=1" H_CRLF;
|
||||
int err = 0;
|
||||
|
||||
const char *dom = "www.example.com";
|
||||
int added;
|
||||
|
||||
cookie.max_len = (int) sizeof(cookie.data);
|
||||
cookie.data[0] = '\0';
|
||||
added = cookie_add(&cookie, "name", "value", dom, "/");
|
||||
added |= cookie_add(&cookie, "has_js", "1", dom, "/");
|
||||
/* different domain: must be filtered out */
|
||||
added |= cookie_add(&cookie, "junk", "x", "other.org", "/");
|
||||
if (added) {
|
||||
printf("cookie-header: FAIL (cookie_add setup)\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
http_cookie_header_selftest(&cookie, dom, "/", hdr,
|
||||
sizeof(hdr));
|
||||
if (strcmp(hdr, expected) != 0)
|
||||
err = 1;
|
||||
if (strstr(hdr, "$Version") != NULL ||
|
||||
strstr(hdr, "$Path") != NULL)
|
||||
err = 1;
|
||||
if (strstr(hdr, "junk") != NULL) // wrong-domain cookie leaked
|
||||
err = 1;
|
||||
printf("cookie-header: %s\n", err ? "FAIL" : "OK");
|
||||
if (err)
|
||||
printf(" got: %s\n", hdr);
|
||||
htsmain_free();
|
||||
return err;
|
||||
} break;
|
||||
case '!':
|
||||
HTS_PANIC_PRINTF
|
||||
("Option #! is disabled for security reasons");
|
||||
@@ -3610,12 +3686,12 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
printf("Mirror launched on %s by HTTrack Website Copier/"
|
||||
HTTRACK_VERSION "%s " HTTRACK_AFF_AUTHORS "" LF, t,
|
||||
hts_get_version_info(opt));
|
||||
if (opt->wizard == 0) {
|
||||
if (opt->wizard == HTS_WIZARD_NONE) {
|
||||
printf
|
||||
("mirroring %s with %d levels, %d sockets,t=%d,s=%d,logm=%d,lnk=%d,mdg=%d\n",
|
||||
url, opt->depth, opt->maxsoc, opt->travel, opt->seeker,
|
||||
httrack_logmode, opt->urlmode, opt->getmode);
|
||||
} else { // the magic wizard
|
||||
} else { // the magic wizard
|
||||
printf("mirroring %s with the wizard help..\n", url);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -242,6 +242,14 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_NOPARAM "(none)"
|
||||
#define HTS_NOPARAM2 "\"(none)\""
|
||||
|
||||
/* Boolean flag for option fields and API yes/no returns. An enum (not C bool)
|
||||
so it stays int-sized: option fields keep the httrackp layout/ABI, and a
|
||||
return type stays compatible with the int it replaces. */
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
||||
#endif
|
||||
|
||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||
#define maximum(A,B) ( (A) > (B) ? (A) : (B) )
|
||||
|
||||
|
||||
391
src/htslib.c
391
src/htslib.c
@@ -644,6 +644,165 @@ T_SOC http_fopen(httrackp * opt, const char *adr, const char *fil, htsblk * reto
|
||||
return http_xfopen(opt, 0, 1, 1, NULL, adr, fil, retour);
|
||||
}
|
||||
|
||||
// Read a CRLF line from a non-blocking socket (waits up to timeout per recv).
|
||||
// Returns the line length (0 = empty), or -1 on timeout/EOF/error.
|
||||
static int proxy_getline(T_SOC soc, char *s, int max, int timeout) {
|
||||
int j = 0;
|
||||
|
||||
for (;;) {
|
||||
unsigned char ch;
|
||||
int n;
|
||||
|
||||
if (!check_readinput_t(soc, timeout))
|
||||
return -1; // timed out waiting for data
|
||||
n = (int) recv(soc, &ch, 1, 0);
|
||||
if (n == 1) {
|
||||
if (ch == 13) // CR
|
||||
continue;
|
||||
if (ch == 10) // LF: end of line
|
||||
break;
|
||||
if (j >= max - 1)
|
||||
return -1; // line too long: bound the read against a hostile proxy
|
||||
s[j++] = (char) ch;
|
||||
} else if (n == 0) {
|
||||
return -1; // connection closed
|
||||
} else {
|
||||
#ifdef _WIN32
|
||||
if (WSAGetLastError() == WSAEWOULDBLOCK)
|
||||
continue;
|
||||
#else
|
||||
if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
|
||||
continue;
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
s[j] = '\0';
|
||||
return j;
|
||||
}
|
||||
|
||||
/* Ask the proxy connected on retour->soc to open a CONNECT tunnel to the
   origin named by adr, so that TLS can then be negotiated end-to-end.
   - opt:     only used for the scratch buffer needed to unescape credentials
   - retour:  supplies the socket and the proxy name (which may carry
              user:pass); on most failures retour->msg receives a reason
   - adr:     origin address; port 443 is assumed when none is given
   - timeout: passed to each wait for the socket to become readable/writable
   Returns 1 once the proxy answered 2xx and its headers were consumed,
   0 otherwise (including when the socket is INVALID_SOCKET, in which case
   retour->msg is left untouched). */
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
                      int timeout) {
  const T_SOC sock = retour->soc;
  const char *const target = jump_identification_const(adr);   // host[:port]
  const char *const explicit_port = jump_toport_const(adr);    // ":port" or NULL
  char BIGSTK authority[HTS_URLMAXSIZE * 2];
  char BIGSTK request[HTS_URLMAXSIZE * 4 + 1100];
  char reply[1024];
  int status;
  size_t pos;

  if (sock == INVALID_SOCKET)
    return 0;

  // CONNECT wants an explicit host:port; fall back to the https port
  if (explicit_port == NULL) {
    snprintf(authority, sizeof(authority), "%s:%d", target, 443);
  } else {
    authority[0] = '\0';
    strlcatbuff(authority, target, sizeof(authority));  // already host:port
  }

  // backstop: a control character (CR/LF in particular) in the host must
  // never be able to inject an extra line into the CONNECT request
  for (pos = 0; authority[pos] != '\0'; pos++) {
    if ((unsigned char) authority[pos] < ' ') {
      strcpybuff(retour->msg, "proxy CONNECT: invalid host");
      return 0;
    }
  }

  snprintf(request, sizeof(request),
           "CONNECT %s HTTP/1.0" H_CRLF "Host: %s" H_CRLF, authority,
           authority);

  // proxy credentials belong on the CONNECT, not on the tunneled request
  if (link_has_authorization(retour->req.proxy.name)) {
    const char *cred_end = jump_identification_const(retour->req.proxy.name);
    const char *cred_begin = jump_protocol_const(retour->req.proxy.name);
    char encoded[1100];
    char credentials[256];

    encoded[0] = '\0';
    credentials[0] = '\0';
    // copy the text between the protocol prefix and the host, minus the
    // one separator character that precedes the host
    strncatbuff(credentials, cred_begin, (int) (cred_end - cred_begin) - 1);
    strcpybuff(credentials, unescape_http(OPT_GET_BUFF(opt),
                                          OPT_GET_BUFF_SIZE(opt),
                                          credentials));
    code64((unsigned char *) credentials, (int) strlen(credentials),
           (unsigned char *) encoded, 0);
    strlcatbuff(request, "Proxy-Authorization: Basic ", sizeof(request));
    strlcatbuff(request, encoded, sizeof(request));
    strlcatbuff(request, H_CRLF, sizeof(request));
  }
  strlcatbuff(request, H_CRLF, sizeof(request));        // end of headers

  // raw send(): ssl is already flagged, so sendc() would go through TLS
  {
    const char *cursor = request;
    size_t left = strlen(request);
    int stalls = 0;

    while (left > 0) {
      const int sent = (int) send(sock, cursor, (int) left, 0);
      int retryable;

      if (sent > 0) {
        cursor += sent;
        left -= (size_t) sent;
        stalls = 0;
        continue;
      }
#ifdef _WIN32
      retryable = (WSAGetLastError() == WSAEWOULDBLOCK);
#else
      retryable = (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
#endif
      // retry only on a transient condition, once the socket is writable
      // again, and at most 100 times in a row; anything else is fatal
      if (retryable && check_writeinput_t(sock, timeout) && ++stalls <= 100)
        continue;
      strcpybuff(retour->msg, "proxy CONNECT: write error");
      return 0;
    }
  }

  // status line from the proxy: "HTTP/1.x <code> ..."
  if (proxy_getline(sock, reply, sizeof(reply), timeout) < 0) {
    strcpybuff(retour->msg, "proxy CONNECT: no response");
    return 0;
  }
  if (sscanf(reply, "HTTP/%*d.%*d %d", &status) < 1)
    status = 0;                 // unparsable: treated as a refusal below
  if (!(status >= 200 && status < 300)) {
    snprintf(retour->msg, sizeof(retour->msg), "proxy CONNECT refused: %s",
             strnotempty(reply) ? reply : "(no status)");
    return 0;
  }

  // swallow the response headers up to the blank line; the count is capped
  // so a proxy flooding us with headers cannot stall the crawl
  {
    int seen = 0;
    int len;

    while ((len = proxy_getline(sock, reply, sizeof(reply), timeout)) != 0) {
      if (len < 0) {
        strcpybuff(retour->msg, "proxy CONNECT: truncated response");
        return 0;
      }
      if (++seen > 64) {
        strcpybuff(retour->msg, "proxy CONNECT: too many response headers");
        return 0;
      }
    }
  }

  return 1;                     // blank line reached: tunnel ready
}
|
||||
|
||||
// ouverture d'une liaison http, envoi d'une requète
|
||||
// mode: 0 GET 1 HEAD [2 POST]
|
||||
// treat: traiter header?
|
||||
@@ -680,14 +839,14 @@ T_SOC http_xfopen(httrackp * opt, int mode, int treat, int waitconnect,
|
||||
|
||||
/* connexion */
|
||||
if (retour) {
|
||||
if ((!(retour->req.proxy.active))
|
||||
|| ((strcmp(adr, "file://") == 0)
|
||||
|| (strncmp(adr, "https://", 8) == 0)
|
||||
)
|
||||
) { /* pas de proxy, ou non utilisable ici */
|
||||
/* no proxy, or proxy not usable here (local file) */
|
||||
if ((!(retour->req.proxy.active)) || (strcmp(adr, "file://") == 0)) {
|
||||
soc = newhttp(opt, adr, retour, -1, waitconnect);
|
||||
} else {
|
||||
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port, waitconnect); // ouvrir sur le proxy à la place
|
||||
// to the proxy; https tunnels to the origin via CONNECT in back_wait
|
||||
// (#85)
|
||||
soc = newhttp(opt, retour->req.proxy.name, retour, retour->req.proxy.port,
|
||||
waitconnect);
|
||||
}
|
||||
} else {
|
||||
soc = newhttp(opt, adr, NULL, -1, waitconnect);
|
||||
@@ -874,6 +1033,50 @@ static void print_buffer(buff_struct*const str, const char *format, ...) {
|
||||
assertf(str->pos < str->capacity);
|
||||
}
|
||||
|
||||
/* Append the request "Cookie:" header line for every stored cookie matching
|
||||
domain/path. RFC 6265 form: bare "name=value" pairs joined by "; ", no
|
||||
$Version/$Path attributes (those are RFC 2965 syntax that modern servers
|
||||
reject, issue #151). Returns the number of cookies emitted. */
|
||||
static int append_cookie_header(buff_struct *bstr, t_cookie *cookie,
|
||||
const char *domain, const char *path) {
|
||||
char buffer[8192];
|
||||
char *b;
|
||||
int cook = 0;
|
||||
int max_cookies = 8;
|
||||
|
||||
if (cookie == NULL)
|
||||
return 0;
|
||||
b = cookie->data;
|
||||
do {
|
||||
b = cookie_find(b, "", domain, path); // next matching cookie
|
||||
if (b != NULL) {
|
||||
max_cookies--;
|
||||
if (!cook) {
|
||||
print_buffer(bstr, "Cookie: ");
|
||||
cook = 1;
|
||||
} else
|
||||
print_buffer(bstr, "; ");
|
||||
print_buffer(bstr, "%s", cookie_get(buffer, b, 5));
|
||||
print_buffer(bstr, "=%s", cookie_get(buffer, b, 6));
|
||||
b = cookie_nextfield(b);
|
||||
}
|
||||
} while (b != NULL && max_cookies > 0);
|
||||
if (cook)
|
||||
print_buffer(bstr, H_CRLF);
|
||||
return cook;
|
||||
}
|
||||
|
||||
/* Self-test entry for append_cookie_header(): build the request Cookie line
|
||||
into dst (always NUL-terminated). Returns the number of cookies emitted. */
|
||||
int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||
const char *path, char *dst, size_t dst_size) {
|
||||
buff_struct bstr = {dst, dst_size, 0};
|
||||
|
||||
assertf(dst != NULL && dst_size > 0);
|
||||
dst[0] = '\0';
|
||||
return append_cookie_header(&bstr, cookie, domain, path);
|
||||
}
|
||||
|
||||
// envoi d'une requète
|
||||
int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
const char *xsend, const char *adr, const char *fil,
|
||||
@@ -999,8 +1202,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
if (xsend)
|
||||
print_buffer(&bstr, "%s", xsend); // éventuelles autres lignes
|
||||
|
||||
// tester proxy authentication
|
||||
if (retour->req.proxy.active) {
|
||||
// for https, auth rides the CONNECT (the tunneled GET would leak it)
|
||||
if (retour->req.proxy.active && strncmp(adr, "https://", 8) != 0) {
|
||||
if (link_has_authorization(retour->req.proxy.name)) { // et hop, authentification proxy!
|
||||
const char *a = jump_identification_const(retour->req.proxy.name);
|
||||
const char *astart = jump_protocol_const(retour->req.proxy.name);
|
||||
@@ -1048,34 +1251,9 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
search_tag + strlen(POSTTOK) + 1))));
|
||||
}
|
||||
}
|
||||
// gestion cookies?
|
||||
// send stored cookies matching this host/path
|
||||
if (cookie) {
|
||||
char buffer[8192];
|
||||
char *b = cookie->data;
|
||||
int cook = 0;
|
||||
int max_cookies = 8;
|
||||
|
||||
do {
|
||||
b = cookie_find(b, "", jump_identification_const(adr), fil); // prochain cookie satisfaisant aux conditions
|
||||
if (b != NULL) {
|
||||
max_cookies--;
|
||||
if (!cook) {
|
||||
print_buffer(&bstr, "Cookie: $Version=1; ");
|
||||
cook = 1;
|
||||
} else
|
||||
print_buffer(&bstr, "; ");
|
||||
print_buffer(&bstr, "%s", cookie_get(buffer, b, 5));
|
||||
print_buffer(&bstr, "=%s", cookie_get(buffer, b, 6));
|
||||
print_buffer(&bstr, "; $Path=%s", cookie_get(buffer, b, 2));
|
||||
b = cookie_nextfield(b);
|
||||
}
|
||||
} while(b != NULL && max_cookies > 0);
|
||||
if (cook) { // on a envoyé un (ou plusieurs) cookie?
|
||||
print_buffer(&bstr, H_CRLF);
|
||||
#if DEBUG_COOK
|
||||
printf("Header:\n%s\n", bstr.buffer);
|
||||
#endif
|
||||
}
|
||||
append_cookie_header(&bstr, cookie, jump_identification_const(adr), fil);
|
||||
}
|
||||
// gérer le keep-alive (garder socket)
|
||||
if (retour->req.http11 && !retour->req.nokeepalive) {
|
||||
@@ -1808,6 +1986,24 @@ int check_readinput_t(T_SOC soc, int timeout) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// wait until the socket is writable, up to timeout seconds
|
||||
int check_writeinput_t(T_SOC soc, int timeout) {
|
||||
if (soc != INVALID_SOCKET) {
|
||||
fd_set fds;
|
||||
struct timeval tv;
|
||||
const int isoc = (int) soc;
|
||||
|
||||
assertf(isoc == soc);
|
||||
FD_ZERO(&fds);
|
||||
FD_SET(isoc, &fds);
|
||||
tv.tv_sec = timeout;
|
||||
tv.tv_usec = 0;
|
||||
select(isoc + 1, NULL, &fds, NULL, &tv);
|
||||
return FD_ISSET(isoc, &fds) ? 1 : 0;
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// idem, sauf qu'ici on peut choisir la taille max de données à recevoir
|
||||
// SI bufl==0 alors le buffer est censé être de 8kos, et on recoit par bloc de lignes
|
||||
// en éliminant les cr (ex: header), arrêt si double-lf
|
||||
@@ -2580,8 +2776,8 @@ HTSEXT_API TStamp mtime_local(void) {
|
||||
assert(! "gettimeofday");
|
||||
}
|
||||
|
||||
return (TStamp) (((TStamp) tv.tv_sec * (TStamp) 1000)
|
||||
+ ((TStamp) tv.tv_usec / (TStamp) 1000000));
|
||||
return (TStamp) (((TStamp) tv.tv_sec * (TStamp) 1000) +
|
||||
((TStamp) tv.tv_usec / (TStamp) 1000));
|
||||
#else
|
||||
struct timeb B;
|
||||
ftime(&B);
|
||||
@@ -3646,8 +3842,9 @@ HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const cha
|
||||
// DOES NOT DECODE %25 (part of CHAR_DELIM)
|
||||
// no_high & 1: decode high chars
|
||||
// no_high & 2: decode space
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||
const char *s, const int no_high) {
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||
const char *s,
|
||||
const hts_boolean no_high) {
|
||||
size_t i, j;
|
||||
|
||||
RUNTIME_TIME_CHECK_SIZE(size);
|
||||
@@ -3931,8 +4128,8 @@ void hts_replace(char *s, char from, char to) {
|
||||
|
||||
// guess a local file's mime type (e.g. fil="toto.gif" -> s="image/gif")
|
||||
// returns 1 if a type was written to s, 0 otherwise
|
||||
int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil) {
|
||||
hts_boolean guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil) {
|
||||
return get_httptype_sized(opt, s, ssize, fil, 1);
|
||||
}
|
||||
|
||||
@@ -3945,8 +4142,8 @@ void guess_httptype(httrackp * opt, char *s, const char *fil) {
|
||||
// write the mime type for fil into s (capacity ssize)
|
||||
// flag: 1 to always return a type (the "application/..." / octet-stream
|
||||
// fallback) returns 1 if a type was written to s, 0 otherwise
|
||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil, int flag) {
|
||||
HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil, hts_boolean flag) {
|
||||
// userdef overrides get_httptype (a rule with an empty value, e.g. "--assume
|
||||
// cgi=", matches but writes nothing: report it as "no type" like the old
|
||||
// code, whose callers tested strnotempty(s))
|
||||
@@ -4196,7 +4393,7 @@ HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil) {
|
||||
|
||||
// page dynamique?
|
||||
// is_dyntype(get_ext("foo.asp"))
|
||||
HTSEXT_API int is_dyntype(const char *fil) {
|
||||
HTSEXT_API hts_boolean is_dyntype(const char *fil) {
|
||||
int j = 0;
|
||||
|
||||
if (!fil)
|
||||
@@ -4214,7 +4411,7 @@ HTSEXT_API int is_dyntype(const char *fil) {
|
||||
|
||||
// types critiques qui ne doivent pas être changés car renvoyés par des serveurs qui ne
|
||||
// connaissent pas le type
|
||||
int may_unknown(httrackp * opt, const char *st) {
|
||||
hts_boolean may_unknown(httrackp *opt, const char *st) {
|
||||
int j = 0;
|
||||
|
||||
// types média
|
||||
@@ -5236,7 +5433,8 @@ HTSEXT_API int hts_uninit_module(void) {
|
||||
}
|
||||
|
||||
// legacy. do not use
|
||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg) {
|
||||
HTSEXT_API hts_boolean hts_log(httrackp *opt, const char *prefix,
|
||||
const char *msg) {
|
||||
if (opt->log != NULL) {
|
||||
fspc(opt, opt->log, prefix);
|
||||
fprintf(opt->log, "%s" LF, msg);
|
||||
@@ -5434,69 +5632,72 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
|
||||
/* default settings */
|
||||
|
||||
opt->wizard = 2; // wizard automatique
|
||||
opt->quiet = 0; // questions
|
||||
//
|
||||
opt->travel = 0; // même adresse
|
||||
opt->wizard = HTS_WIZARD_AUTO; // wizard automatique
|
||||
opt->quiet = HTS_FALSE;
|
||||
//
|
||||
opt->travel = HTS_TRAVEL_SAME_ADDRESS; // même adresse
|
||||
opt->depth = 9999; // mirror total par défaut
|
||||
opt->extdepth = 0; // mais pas à l'extérieur
|
||||
opt->seeker = 1; // down
|
||||
opt->urlmode = 2; // relatif par défaut
|
||||
opt->no_type_change = 0; // change file types
|
||||
opt->seeker = HTS_SEEKER_DOWN; // down
|
||||
opt->urlmode = HTS_URLMODE_RELATIVE; // relatif par défaut
|
||||
opt->no_type_change = HTS_FALSE;
|
||||
opt->debug = LOG_NOTICE; // small log
|
||||
opt->getmode = 3; // linear scan
|
||||
opt->getmode = HTS_GETMODE_HTML | HTS_GETMODE_NONHTML;
|
||||
opt->maxsite = -1; // taille max site (aucune)
|
||||
opt->maxfile_nonhtml = -1; // taille max fichier non html
|
||||
opt->maxfile_html = -1; // idem pour html
|
||||
opt->maxsoc = 4; // nbre socket max
|
||||
opt->fragment = -1; // pas de fragmentation
|
||||
opt->nearlink = 0; // ne pas prendre les liens non-html "adjacents"
|
||||
opt->makeindex = 1; // faire un index
|
||||
opt->kindex = 0; // index 'keyword'
|
||||
opt->delete_old = 1; // effacer anciens fichiers
|
||||
opt->background_on_suspend = 1; // Background the process if Control Z calls signal suspend.
|
||||
opt->makestat = 0; // pas de fichier de stats
|
||||
opt->maketrack = 0; // ni de tracking
|
||||
opt->nearlink = HTS_FALSE;
|
||||
opt->makeindex = HTS_TRUE;
|
||||
opt->kindex = HTS_FALSE;
|
||||
opt->delete_old = HTS_TRUE;
|
||||
opt->background_on_suspend = HTS_TRUE;
|
||||
opt->makestat = HTS_FALSE;
|
||||
opt->maketrack = HTS_FALSE;
|
||||
opt->timeout = 120; // timeout par défaut (2 minutes)
|
||||
opt->cache = 1; // cache prioritaire
|
||||
opt->shell = 0; // pas de shell par defaut
|
||||
opt->cache = HTS_CACHE_PRIORITY; // cache prioritaire
|
||||
opt->shell = HTS_FALSE;
|
||||
opt->proxy.active = 0; // pas de proxy
|
||||
opt->user_agent_send = 1; // envoyer un user-agent
|
||||
opt->user_agent_send = HTS_TRUE;
|
||||
StringCopy(opt->user_agent,
|
||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||
StringCopy(opt->referer, "");
|
||||
StringCopy(opt->from, "");
|
||||
opt->savename_83 = 0; // noms longs par défaut
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||
opt->savename_type = 0; // avec structure originale
|
||||
opt->savename_delayed = 2; // hard delayed type (default)
|
||||
opt->delayed_cached = 1; // cached delayed type (default)
|
||||
opt->mimehtml = 0; // pas MIME-html
|
||||
opt->savename_delayed =
|
||||
HTS_SAVENAME_DELAYED_HARD; // always delay the type check (default)
|
||||
opt->delayed_cached = HTS_TRUE;
|
||||
opt->mimehtml = HTS_FALSE;
|
||||
opt->parsejava = HTSPARSE_DEFAULT; // parser classes
|
||||
opt->hostcontrol = 0; // PAS de control host pour timeout et traffic jammer
|
||||
opt->retry = 2; // 2 retry par défaut
|
||||
opt->errpage = 1; // copier ou générer une page d'erreur en cas d'erreur (404 etc.)
|
||||
opt->check_type = 1; // vérifier type si inconnu (cgi,asp..) SAUF / considéré comme html
|
||||
opt->all_in_cache = 0; // ne pas tout stocker en cache
|
||||
opt->robots = 2; // traiter les robots.txt
|
||||
opt->external = 0; // liens externes normaux
|
||||
opt->passprivacy = 0; // mots de passe dans les fichiers
|
||||
opt->includequery = 1; // include query-string par défaut
|
||||
opt->mirror_first_page = 0; // pas mode mirror links
|
||||
opt->accept_cookie = 1; // gérer les cookies
|
||||
opt->errpage = HTS_TRUE;
|
||||
// d'erreur (404 etc.)
|
||||
opt->check_type = HTS_TRUE;
|
||||
// considéré comme html
|
||||
opt->all_in_cache = HTS_FALSE;
|
||||
opt->robots = HTS_ROBOTS_ALWAYS; // traiter les robots.txt
|
||||
opt->external = HTS_FALSE;
|
||||
opt->passprivacy = HTS_FALSE;
|
||||
opt->includequery = HTS_TRUE;
|
||||
opt->mirror_first_page = HTS_FALSE;
|
||||
opt->accept_cookie = HTS_TRUE;
|
||||
opt->cookie = NULL;
|
||||
opt->http10 = 0; // laisser http/1.1
|
||||
opt->nokeepalive = 0; // pas keep-alive
|
||||
opt->nocompression = 0; // pas de compression
|
||||
opt->tolerant = 0; // ne pas accepter content-length incorrect
|
||||
opt->parseall = 1; // tout parser (tags inconnus, par exemple)
|
||||
opt->parsedebug = 0; // pas de mode débuggage
|
||||
opt->norecatch = 0; // ne pas reprendre les fichiers effacés par l'utilisateur
|
||||
opt->verbosedisplay = 0; // pas d'animation texte
|
||||
opt->sizehack = 0; // size hack
|
||||
opt->urlhack = 1; // url hack (normalizer)
|
||||
opt->http10 = HTS_FALSE;
|
||||
opt->nokeepalive = HTS_FALSE;
|
||||
opt->nocompression = HTS_FALSE;
|
||||
opt->tolerant = HTS_FALSE;
|
||||
opt->parseall = HTS_TRUE;
|
||||
opt->parsedebug = HTS_FALSE;
|
||||
opt->norecatch = HTS_FALSE;
|
||||
opt->verbosedisplay = HTS_VERBOSE_NONE; // no text animation
|
||||
opt->sizehack = HTS_FALSE;
|
||||
opt->urlhack = HTS_TRUE;
|
||||
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
|
||||
opt->ftp_proxy = 1; // proxy http pour ftp
|
||||
opt->convert_utf8 = 1; // convert html to UTF-8
|
||||
opt->ftp_proxy = HTS_TRUE;
|
||||
opt->convert_utf8 = HTS_TRUE;
|
||||
StringCopy(opt->filelist, "");
|
||||
StringCopy(opt->lang_iso, "en, *");
|
||||
StringCopy(opt->accept,
|
||||
@@ -5507,9 +5708,9 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
//
|
||||
opt->log = stdout;
|
||||
opt->errlog = stderr;
|
||||
opt->flush = 1; // flush sur les fichiers log
|
||||
//opt->aff_progress=0;
|
||||
opt->keyboard = 0;
|
||||
opt->flush = HTS_TRUE;
|
||||
// opt->aff_progress=0;
|
||||
opt->keyboard = HTS_FALSE;
|
||||
//
|
||||
StringCopy(opt->path_html, "");
|
||||
StringCopy(opt->path_html_utf8, "");
|
||||
@@ -5526,10 +5727,10 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->waittime = -1; // wait until.. hh*3600+mm*60+ss
|
||||
//
|
||||
opt->exec = "";
|
||||
opt->is_update = 0; // not an update (yet)
|
||||
opt->dir_topindex = 0; // do not built top index (yet)
|
||||
opt->is_update = HTS_FALSE;
|
||||
opt->dir_topindex = HTS_FALSE;
|
||||
//
|
||||
opt->bypass_limits = 0; // enforce limits by default
|
||||
opt->bypass_limits = HTS_FALSE;
|
||||
opt->state.stop = 0; // stopper
|
||||
opt->state.exit_xh = 0; // abort
|
||||
//
|
||||
|
||||
16
src/htslib.h
16
src/htslib.h
@@ -182,6 +182,11 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode, const char *xsend
|
||||
const char *adr, const char *fil,
|
||||
const char *referer_adr, const char *referer_fil,
|
||||
htsblk * retour);
|
||||
/* Build the request "Cookie:" header line for stored cookies matching
|
||||
domain/path into dst (NUL-terminated). Exposed for the -#Q self-test;
|
||||
wraps the same logic http_sendhead() uses. Returns cookies emitted. */
|
||||
int http_cookie_header_selftest(t_cookie *cookie, const char *domain,
|
||||
const char *path, char *dst, size_t dst_size);
|
||||
|
||||
//int newhttp(char* iadr,char* err=NULL);
|
||||
T_SOC newhttp(httrackp * opt, const char *iadr, htsblk * retour, int port,
|
||||
@@ -193,6 +198,17 @@ HTS_INLINE void deletesoc_r(htsblk * r);
|
||||
htsblk http_test(httrackp * opt, const char *adr, const char *fil, char *loc);
|
||||
int check_readinput(htsblk * r);
|
||||
int check_readinput_t(T_SOC soc, int timeout);
|
||||
int check_writeinput_t(T_SOC soc, int timeout);
|
||||
|
||||
/* Open an HTTP CONNECT tunnel through the active proxy for an https request:
|
||||
`retour->soc` must already be TCP-connected to the proxy, and `adr` is the
|
||||
origin authority (url_adr, e.g. "https://host:port"). Sends the CONNECT
|
||||
request (with Proxy-Authorization when the proxy carries credentials) and
|
||||
reads the proxy's status line, so the caller's TLS handshake then runs
|
||||
end-to-end with the origin. Blocks up to `timeout` seconds. Returns 1 on a
|
||||
2xx tunnel, 0 on failure (retour->msg/statuscode set). */
|
||||
int http_proxy_tunnel(httrackp *opt, htsblk *retour, const char *adr,
|
||||
int timeout);
|
||||
void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * retour,
|
||||
char *rcvd);
|
||||
void treatfirstline(htsblk * retour, const char *rcvd);
|
||||
|
||||
@@ -184,10 +184,11 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
|
||||
/* 8-3 ? */
|
||||
switch (opt->savename_83) {
|
||||
case 1: // 8-3
|
||||
case HTS_SAVENAME_83_DOS: // 8-3
|
||||
max_char = 8;
|
||||
break;
|
||||
case 2: // Level 2 File names may be up to 31 characters.
|
||||
case HTS_SAVENAME_83_ISO9660: // Level 2 File names may be up to 31
|
||||
// characters.
|
||||
max_char = 31;
|
||||
break;
|
||||
default:
|
||||
@@ -324,7 +325,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
|
||||
/* replace shtml to html.. */
|
||||
if (opt->savename_delayed == 2)
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD)
|
||||
is_html = -1; /* ALWAYS delay type */
|
||||
else
|
||||
is_html = ishtml(opt, fil);
|
||||
@@ -363,7 +364,9 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
) {
|
||||
// tester type avec requète HEAD si on ne connait pas le type du fichier
|
||||
if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
|
||||
if (opt->savename_delayed == 2 || (ishtest = ishtml(opt, fil)) < 0) { // on ne sait pas si c'est un html ou un fichier..
|
||||
if (opt->savename_delayed == HTS_SAVENAME_DELAYED_HARD ||
|
||||
(ishtest = ishtml(opt, fil)) <
|
||||
0) { // unsure whether it's html or a file
|
||||
// lire dans le cache
|
||||
htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
|
||||
|
||||
@@ -393,11 +396,12 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
#endif
|
||||
//
|
||||
} else if (opt->savename_delayed != 2 && is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
Lookup mimetype not only by extension,
|
||||
but also by filename */
|
||||
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type,
|
||||
that is, ".html" */
|
||||
} else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD &&
|
||||
is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
|
||||
Lookup mimetype not only by extension,
|
||||
but also by filename */
|
||||
/* Note: "foo.cgi => text/html" means that foo.cgi shall have the
|
||||
text/html MIME file type, that is, ".html" */
|
||||
char BIGSTK mime[1024];
|
||||
|
||||
mime[0] = ext[0] = '\0';
|
||||
@@ -408,9 +412,13 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
}
|
||||
}
|
||||
// note: if savename_delayed is enabled, the naming will be temporary (and slightly invalid!)
|
||||
// note: if we are about to stop (opt->state.stop), back_add() will fail later
|
||||
else if (opt->savename_delayed != 0 && !opt->state.stop) {
|
||||
// note: if savename_delayed is enabled, the naming will be temporary
|
||||
// (and slightly invalid!)
|
||||
//
|
||||
// note: if we are about to stop (opt->state.stop), back_add() will
|
||||
// fail later
|
||||
else if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||
!opt->state.stop) {
|
||||
// Check if the file is ready in backing. We basically take the same logic as later.
|
||||
// FIXME: we should cleanup and factorize this unholy mess
|
||||
if (headers != NULL && headers->status >= 0 && !is_redirect) {
|
||||
@@ -698,7 +706,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
}
|
||||
// restaurer
|
||||
opt->state._hts_in_html_parsing = hihp;
|
||||
} // caché?
|
||||
} // caché?
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1190,7 +1198,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
// Not used anymore unless non-delayed types.
|
||||
// de même en cas de manque d'extension on en place une de manière forcée..
|
||||
// cela évite les /chez/toto et les /chez/toto/index.html incompatibles
|
||||
if (opt->savename_type != -1 && opt->savename_delayed != 2) {
|
||||
if (opt->savename_type != -1 &&
|
||||
opt->savename_delayed != HTS_SAVENAME_DELAYED_HARD) {
|
||||
char *a = afs->save + strlen(afs->save) - 1;
|
||||
|
||||
while((a > afs->save) && (*a != '.') && (*a != '/'))
|
||||
@@ -1236,31 +1245,21 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
size_t i;
|
||||
for(i = 0 ; afs->save[i] != '\0' ; i++) {
|
||||
unsigned char c = (unsigned char) afs->save[i];
|
||||
if (c < 32 // control
|
||||
|| c == 127 // unwise
|
||||
|| c == '~' // unix unwise
|
||||
|| c == '\\' // windows separator
|
||||
|| c == ':' // windows forbidden
|
||||
|| c == '*' // windows forbidden
|
||||
|| c == '?' // windows forbidden
|
||||
|| c == '\"' // windows forbidden
|
||||
|| c == '<' // windows forbidden
|
||||
|| c == '>' // windows forbidden
|
||||
|| c == '|' // windows forbidden
|
||||
//|| c == '@' // ?
|
||||
||
|
||||
(
|
||||
opt->savename_83 == 2 // CDROM
|
||||
&&
|
||||
(
|
||||
c == '-'
|
||||
|| c == '='
|
||||
|| c == '+'
|
||||
)
|
||||
)
|
||||
)
|
||||
{
|
||||
afs->save[i] = '_';
|
||||
if (c < 32 // control
|
||||
|| c == 127 // unwise
|
||||
|| c == '~' // unix unwise
|
||||
|| c == '\\' // windows separator
|
||||
|| c == ':' // windows forbidden
|
||||
|| c == '*' // windows forbidden
|
||||
|| c == '?' // windows forbidden
|
||||
|| c == '\"' // windows forbidden
|
||||
|| c == '<' // windows forbidden
|
||||
|| c == '>' // windows forbidden
|
||||
|| c == '|' // windows forbidden
|
||||
//|| c == '@' // ?
|
||||
|| (opt->savename_83 == HTS_SAVENAME_83_ISO9660 // CDROM
|
||||
&& (c == '-' || c == '=' || c == '+'))) {
|
||||
afs->save[i] = '_';
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1521,7 +1520,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
char *a = afs->save + strlen(afs->save) - 1;
|
||||
char *b;
|
||||
int n = 2;
|
||||
char collisionSeparator = ((opt->savename_83 != 2) ? '-' : '_');
|
||||
char collisionSeparator =
|
||||
((opt->savename_83 != HTS_SAVENAME_83_ISO9660) ? '-' : '_');
|
||||
|
||||
tempo[0] = '\0';
|
||||
|
||||
|
||||
198
src/htsopt.h
198
src/htsopt.h
@@ -285,6 +285,102 @@ typedef enum htsparsejava_flags {
|
||||
HTSPARSE_NO_AGGRESSIVE = 8 // don't aggressively parse .js or .java
|
||||
} htsparsejava_flags;
|
||||
|
||||
/* opt->urlmode: how links are rewritten inside saved pages. */
#ifndef HTS_DEF_DEFSTRUCT_hts_urlmode
#define HTS_DEF_DEFSTRUCT_hts_urlmode
typedef enum hts_urlmode {
  /** Absolute URL (http://host/path) everywhere. */
  HTS_URLMODE_ABSOLUTE = 0,
  /** Legacy "file:" form, unused. */
  HTS_URLMODE_ABSOLUTE_FILE = 1,
  /** Relative link (the value hts_create_opt() starts with). */
  HTS_URLMODE_RELATIVE = 2,
  /** Absolute URI from the root (/path). */
  HTS_URLMODE_ABSOLUTE_URI = 3,
  /** Keep the original link, do not rewrite it. */
  HTS_URLMODE_KEEP_ORIGINAL = 4,
  /** Transparent-proxy URL. */
  HTS_URLMODE_TRANSPARENT_PROXY = 5
} hts_urlmode;
#endif
|
||||
|
||||
/* Cache policy for updates and retries (opt->cache).
   NOTE(review): hts_create_opt() initializes opt->cache to
   HTS_CACHE_PRIORITY. TODO confirm whether another layer (e.g. command-line
   parsing) makes HTS_CACHE_TEST_UPDATE the effective default. */
#ifndef HTS_DEF_DEFSTRUCT_hts_cachemode
#define HTS_DEF_DEFSTRUCT_hts_cachemode
typedef enum hts_cachemode {
  HTS_CACHE_NONE = 0,       /**< no cache */
  HTS_CACHE_PRIORITY = 1,   /**< cache takes priority over the network */
  HTS_CACHE_TEST_UPDATE = 2 /**< check for update before reuse */
} hts_cachemode;
#endif
|
||||
|
||||
/* opt->wizard: how much the interactive wizard is involved. */
#ifndef HTS_DEF_DEFSTRUCT_hts_wizard
#define HTS_DEF_DEFSTRUCT_hts_wizard
typedef enum hts_wizard {
  /** No wizard. */
  HTS_WIZARD_NONE = 0,
  /** The wizard asks questions. */
  HTS_WIZARD_ASK = 1,
  /** The wizard runs without asking (the value hts_create_opt() starts
      with). */
  HTS_WIZARD_AUTO = 2
} hts_wizard;
#endif
|
||||
|
||||
/* robots.txt / meta-robots obedience level (opt->robots).
   hts_create_opt() initializes the option to HTS_ROBOTS_ALWAYS. */
#ifndef HTS_DEF_DEFSTRUCT_hts_robots
#define HTS_DEF_DEFSTRUCT_hts_robots
typedef enum hts_robots {
  HTS_ROBOTS_NEVER = 0,        /**< ignore robots rules */
  HTS_ROBOTS_SOMETIMES = 1,    /**< partial obedience */
  HTS_ROBOTS_ALWAYS = 2,       /**< obey robots rules (default) */
  HTS_ROBOTS_ALWAYS_STRICT = 3 /**< obey even strict rules */
} hts_robots;
#endif
|
||||
|
||||
/* opt->getmode: bitmask selecting what gets fetched; values may be OR'd. */
typedef enum hts_getmode {
  /** Save HTML files. */
  HTS_GETMODE_HTML = 0x01,
  /** Save non-HTML files. */
  HTS_GETMODE_NONHTML = 0x02,
  /** Fetch HTML first, then the other files. */
  HTS_GETMODE_HTML_FIRST = 0x04
} hts_getmode;
|
||||
|
||||
/* opt->seeker: bitmask of the directions allowed in the directory tree. */
typedef enum hts_seeker {
  /** May descend into subdirectories. */
  HTS_SEEKER_DOWN = 0x01,
  /** May ascend to parent directories. */
  HTS_SEEKER_UP = 0x02
} hts_seeker;
|
||||
|
||||
/* opt->travel layout: the link-following scope sits in the low byte and
   flags are OR'd in above it. */
typedef enum hts_travel_scope {
  /** Stay on the same address (host). */
  HTS_TRAVEL_SAME_ADDRESS = 0,
  /** Stay on the same principal domain. */
  HTS_TRAVEL_SAME_DOMAIN = 1,
  /** Stay on the same TLD (e.g. .com). */
  HTS_TRAVEL_SAME_TLD = 2,
  /** Follow links anywhere on the web. */
  HTS_TRAVEL_EVERYWHERE = 7,
  /** Flag: also test forbidden URLs (-t). */
  HTS_TRAVEL_TEST_ALL = 0x100
} hts_travel_scope;

/* Mask that extracts the scope value (low byte) from opt->travel. */
#define HTS_TRAVEL_SCOPE_MASK 0xff
|
||||
|
||||
/* opt->verbosedisplay: level of detail of the text progress display. */
typedef enum hts_verbosedisplay {
  /** No animated progress display (the value hts_create_opt() starts with). */
  HTS_VERBOSE_NONE = 0,
  /** Minimal single-line progress. */
  HTS_VERBOSE_SIMPLE = 1,
  /** Full animated progress. */
  HTS_VERBOSE_FULL = 2
} hts_verbosedisplay;
|
||||
|
||||
/* opt->savename_delayed: when the file type of a link gets resolved. */
typedef enum hts_savename_delayed {
  /** Resolve the type immediately. */
  HTS_SAVENAME_DELAYED_NONE = 0,
  /** Delay the type check when the type is unknown. */
  HTS_SAVENAME_DELAYED_SOFT = 1,
  /** Always delay the type check (the value hts_create_opt() starts with). */
  HTS_SAVENAME_DELAYED_HARD = 2
} hts_savename_delayed;
|
||||
|
||||
/* opt->savename_83: length layout applied to saved file names. */
typedef enum hts_savename_83 {
  /** Long file names (the value hts_create_opt() starts with). */
  HTS_SAVENAME_83_LONG = 0,
  /** DOS 8.3 names (ISO9660 level 1). */
  HTS_SAVENAME_83_DOS = 1,
  /** ISO9660 level 2 names (up to 31 characters). */
  HTS_SAVENAME_83_ISO9660 = 2
} hts_savename_83;
|
||||
|
||||
/* opt->hostcontrol: bitmask of the conditions that get a host banned. */
typedef enum hts_hostcontrol {
  /** Ban a host that times out. */
  HTS_HOSTCONTROL_BAN_TIMEOUT = 0x01,
  /** Ban a host that is too slow. */
  HTS_HOSTCONTROL_BAN_SLOW = 0x02
} hts_hostcontrol;
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_buffers
|
||||
#define HTS_DEF_FWSTRUCT_lien_buffers
|
||||
typedef struct lien_buffers lien_buffers;
|
||||
@@ -308,15 +404,16 @@ typedef struct httrackp httrackp;
|
||||
struct httrackp {
|
||||
size_t size_httrackp; /**< size of this structure (version/ABI guard) */
|
||||
/* */
|
||||
int wizard; /**< interactive wizard level (none/full/light) */
|
||||
int flush; /**< fflush() log files after each write */
|
||||
hts_wizard wizard; /**< interactive wizard level (none/ask/auto) */
|
||||
hts_boolean flush; /**< fflush() log files after each write */
|
||||
int travel; /**< link-following scope (same domain, etc.) */
|
||||
int seeker; /**< allowed direction: go up and/or down the tree */
|
||||
int depth; /**< maximum recursion depth (-rN) */
|
||||
int extdepth; /**< maximum recursion depth outside the start domain */
|
||||
int urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
||||
int no_type_change; // do not change file type according to MIME
|
||||
int debug; /**< debug logging level */
|
||||
hts_urlmode
|
||||
urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
||||
hts_boolean no_type_change; // do not change file type according to MIME
|
||||
hts_log_type debug; /**< debug logging level */
|
||||
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
||||
FILE *log; /**< informational log stream; NULL mutes it */
|
||||
FILE *errlog; /**< error log stream; NULL mutes it */
|
||||
@@ -325,28 +422,31 @@ struct httrackp {
|
||||
LLint maxfile_html; /**< max bytes per HTML file */
|
||||
int maxsoc; /**< max simultaneous sockets (-cN) */
|
||||
LLint fragment; /**< split site after this many bytes */
|
||||
int nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||
int makeindex; /**< build a top-level index.html */
|
||||
int kindex; /**< build a keyword index */
|
||||
int delete_old; /**< delete locally obsolete files after update */
|
||||
hts_boolean
|
||||
nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||
hts_boolean makeindex; /**< build a top-level index.html */
|
||||
hts_boolean kindex; /**< build a keyword index */
|
||||
hts_boolean delete_old; /**< delete locally obsolete files after update */
|
||||
int timeout; /**< connection timeout in seconds */
|
||||
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
||||
int maxtime; /**< max total mirror duration in seconds */
|
||||
int maxrate; /**< max transfer rate cap (bytes/s) */
|
||||
float maxconn; /**< max connections per second */
|
||||
int waittime; /**< scheduled start time (wall-clock seconds) */
|
||||
int cache; /**< cache generation mode */
|
||||
hts_cachemode cache; /**< cache generation mode */
|
||||
// int aff_progress; // progress bar
|
||||
int shell; /**< driven by a shell over stdin/stdout pipes */
|
||||
hts_boolean shell; /**< driven by a shell over stdin/stdout pipes */
|
||||
t_proxy proxy; /**< proxy configuration */
|
||||
int savename_83; /**< force 8.3 (DOS) file names */
|
||||
hts_savename_83
|
||||
savename_83; /**< saved-name length layout (long/DOS/ISO9660) */
|
||||
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
||||
String
|
||||
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
||||
int savename_delayed; // delayed type check
|
||||
int delayed_cached; // delayed type check can be cached to speedup updates
|
||||
int mimehtml; /**< produce a single MIME/MHTML archive */
|
||||
int user_agent_send; /**< send a User-Agent header */
|
||||
hts_savename_delayed savename_delayed; /**< delayed type-check policy */
|
||||
hts_boolean
|
||||
delayed_cached; // delayed type check can be cached to speedup updates
|
||||
hts_boolean mimehtml; /**< produce a single MIME/MHTML archive */
|
||||
hts_boolean user_agent_send; /**< send a User-Agent header */
|
||||
String user_agent; /**< User-Agent value (e.g. httrack/1.0) */
|
||||
String referer; /**< Referer value to send */
|
||||
String from; /**< From value to send */
|
||||
@@ -355,37 +455,39 @@ struct httrackp {
|
||||
String path_html_utf8; /**< output directory for the mirror, UTF-8 form */
|
||||
String path_bin; /**< directory for HTML templates */
|
||||
int retry; /**< extra retries on a failed transfer */
|
||||
int makestat; /**< maintain a transfer-statistics log */
|
||||
int maketrack; /**< maintain an operations-statistics log */
|
||||
hts_boolean makestat; /**< maintain a transfer-statistics log */
|
||||
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||
int hostcontrol; /**< drop hosts that are too slow, etc. */
|
||||
int errpage; /**< generate an error page on 404 and similar */
|
||||
int check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||
*/
|
||||
int all_in_cache; /**< keep all retrieved data in the cache */
|
||||
int robots; /**< robots.txt handling level */
|
||||
int external; /**< render external links as error pages */
|
||||
int passprivacy; /**< strip passwords from external links */
|
||||
int includequery; /**< include the query string in saved names */
|
||||
int mirror_first_page; /**< only mirror the links of the first page */
|
||||
int hostcontrol; /**< ban slow/timing-out hosts; see hts_hostcontrol bits */
|
||||
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
||||
hts_boolean
|
||||
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||
*/
|
||||
hts_boolean all_in_cache; /**< keep all retrieved data in the cache */
|
||||
hts_robots robots; /**< robots.txt handling level */
|
||||
hts_boolean external; /**< render external links as error pages */
|
||||
hts_boolean passprivacy; /**< strip passwords from external links */
|
||||
hts_boolean includequery; /**< include the query string in saved names */
|
||||
hts_boolean mirror_first_page; /**< only mirror the links of the first page */
|
||||
String sys_com; /**< system command to run */
|
||||
int sys_com_exec; /**< actually execute sys_com */
|
||||
int accept_cookie; /**< accept and send cookies */
|
||||
hts_boolean sys_com_exec; /**< actually execute sys_com */
|
||||
hts_boolean accept_cookie; /**< accept and send cookies */
|
||||
t_cookie *cookie; /**< cookie store */
|
||||
int http10; /**< force HTTP/1.0 */
|
||||
int nokeepalive; /**< disable keep-alive */
|
||||
int nocompression; /**< disable content compression */
|
||||
int sizehack; /**< treat same-size response as "updated" */
|
||||
int urlhack; // force "url normalization" to avoid loops
|
||||
int tolerant; /**< accept an incorrect Content-Length */
|
||||
int parseall; /**< parse aggressively, including unknown tags with links */
|
||||
int parsedebug; /**< parser debug mode */
|
||||
int norecatch; /**< do not re-fetch files the user deleted locally */
|
||||
int verbosedisplay; /**< animated text progress display */
|
||||
hts_boolean http10; /**< force HTTP/1.0 */
|
||||
hts_boolean nokeepalive; /**< disable keep-alive */
|
||||
hts_boolean nocompression; /**< disable content compression */
|
||||
hts_boolean sizehack; /**< treat same-size response as "updated" */
|
||||
hts_boolean urlhack; // force "url normalization" to avoid loops
|
||||
hts_boolean tolerant; /**< accept an incorrect Content-Length */
|
||||
hts_boolean
|
||||
parseall; /**< parse aggressively, including unknown tags with links */
|
||||
hts_boolean parsedebug; /**< parser debug mode */
|
||||
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
||||
hts_verbosedisplay verbosedisplay; /**< animated text progress display */
|
||||
String footer; /**< footer/info line injected into pages */
|
||||
int maxcache; /**< in-memory cache backing limit (bytes) */
|
||||
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
||||
int ftp_proxy; /**< use the HTTP proxy for FTP too */
|
||||
hts_boolean ftp_proxy; /**< use the HTTP proxy for FTP too */
|
||||
String filelist; /**< file listing URLs to include */
|
||||
String urllist; /**< file listing filters to include */
|
||||
htsfilters filters; /**< filter pointers (+/-pattern rules) */
|
||||
@@ -399,20 +501,20 @@ struct httrackp {
|
||||
String headers; // Additional headers
|
||||
String mimedefs; // ext1=mimetype1\next2=mimetype2..
|
||||
String mod_blacklist; /**< blacklisted modules */
|
||||
int convert_utf8; // filenames UTF-8 conversion (3.46)
|
||||
hts_boolean convert_utf8; // filenames UTF-8 conversion (3.46)
|
||||
//
|
||||
int maxlink; /**< max number of links */
|
||||
int maxfilter; /**< max number of filters */
|
||||
//
|
||||
const char *exec; /**< path of the running executable */
|
||||
//
|
||||
int quiet; /**< suppress non-wizard questions */
|
||||
int keyboard; /**< poll stdin for keyboard input */
|
||||
int bypass_limits; // bypass built-in limits
|
||||
int background_on_suspend; // background process on suspend signal
|
||||
hts_boolean quiet; /**< suppress non-wizard questions */
|
||||
hts_boolean keyboard; /**< poll stdin for keyboard input */
|
||||
hts_boolean bypass_limits; // bypass built-in limits
|
||||
hts_boolean background_on_suspend; // background process on suspend signal
|
||||
//
|
||||
int is_update; /**< this run is an update (show "File updated...") */
|
||||
int dir_topindex; /**< rebuild the top index afterwards */
|
||||
hts_boolean is_update; /**< this run is an update (show "File updated...") */
|
||||
hts_boolean dir_topindex; /**< rebuild the top index afterwards */
|
||||
//
|
||||
// callbacks
|
||||
t_hts_htmlcheck_callbacks
|
||||
|
||||
207
src/htsparse.c
207
src/htsparse.c
@@ -296,6 +296,48 @@ static const char *html_inline_safe(const char *src, char *dst, size_t size) {
|
||||
return dst;
|
||||
}
|
||||
|
||||
/* Byte before html, or a space sentinel at the buffer start where html[-1]
|
||||
would underflow; space reads as the word boundary the guards want there. */
|
||||
static HTS_INLINE char html_prevc(const char *html, const char *start) {
|
||||
return html > start ? html[-1] : ' ';
|
||||
}
|
||||
|
||||
/* Tell whether the len bytes starting at s spell exactly one HTTP method
   name, compared case-insensitively. Used so that the first argument of
   XHR.open(method, url) is not mistaken for a URL (#218).
   Returns 1 on a match, 0 otherwise. */
static int is_http_method(const char *s, size_t len) {
  static const char *const known[] = {"GET",  "POST",    "PUT",   "DELETE",
                                      "HEAD", "OPTIONS", "PATCH", "TRACE"};
  const size_t count = sizeof(known) / sizeof(known[0]);
  size_t k;

  for (k = 0; k < count; k++) {
    const char *const name = known[k];

    // lengths must agree first, so a longer token such as "GETTER" is
    // never accepted on the strength of its "GET" prefix
    if (strlen(name) != len)
      continue;
    if (strfield(s, name) == (int) len)
      return 1;
  }
  return 0;
}
|
||||
|
||||
/* Percent-encode '(' and ')' in a link emitted into an unquoted CSS url(...):
   a literal ')' closes the token early and the UA mis-parses the value (#163).
   The UA decodes %28/%29 back to the saved-on-disk name.

   s:    NUL-terminated string, rewritten in place.
   size: total capacity of the buffer holding s, terminator included.

   The string is left untouched when it holds no parenthesis, when it is not
   terminated within size bytes, or when the escaped form would not fit in
   size bytes: a truncated link would point to a different resource, which is
   worse than an unescaped one. */
static void escape_url_parens(char *const s, const size_t size) {
  size_t len, extra = 0;
  size_t rd, wr;

  // measure the string and count the bytes the escapes will add
  // (each parenthesis grows from one byte to three)
  for (len = 0; len < size && s[len] != '\0'; len++) {
    if (s[len] == '(' || s[len] == ')')
      extra += 2;
  }
  if (extra == 0 || len >= size || extra >= size - len)
    return; // nothing to do, unterminated input, or no room for the result

  // expand from the end so that no unread byte is overwritten: the write
  // cursor always stays at or ahead of the read cursor
  rd = len;
  wr = len + extra;
  s[wr] = '\0';
  while (rd > 0) {
    const char c = s[--rd];

    if (c == '(' || c == ')') {
      s[--wr] = (c == '(') ? '8' : '9';
      s[--wr] = '2';
      s[--wr] = '%';
    } else {
      s[--wr] = c;
    }
  }
}
|
||||
|
||||
/* Main parser */
|
||||
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
char catbuff[CATBUFF_SIZE];
|
||||
@@ -349,7 +391,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
#endif
|
||||
|
||||
// Now, parsing
|
||||
if ((opt->getmode & 1) && (ptr > 0)) { // récupérer les html sur disque
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
// créer le fichier html local
|
||||
HT_ADD_FOP; // écrire peu à peu le fichier
|
||||
}
|
||||
@@ -553,10 +595,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (opt->depth == heap(ptr)->depth) { // on note toujours les premiers liens
|
||||
if (!in_media) {
|
||||
if (opt->makeindex && (ptr > 0)) {
|
||||
if (opt->getmode & 1) { // autorisation d'écrire
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
p = strfield(html, "title");
|
||||
if (p) {
|
||||
if (*(html - 1) == '/')
|
||||
if (html_prevc(html, r->adr) == '/')
|
||||
p = 0; // /title
|
||||
} else {
|
||||
if (strfield(html, "/html"))
|
||||
@@ -704,7 +746,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
}
|
||||
|
||||
if (opt->getmode & 1) { // sauver html
|
||||
if (opt->getmode & HTS_GETMODE_HTML) { // sauver html
|
||||
p = 0;
|
||||
switch (emited_footer) {
|
||||
case 0:
|
||||
@@ -740,7 +782,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
if (strchr(r->adr, '\r'))
|
||||
eol = "\r\n";
|
||||
if (StringNotEmpty(opt->footer) || opt->urlmode != 4) { /* != preserve */
|
||||
if (StringNotEmpty(opt->footer) ||
|
||||
opt->urlmode != HTS_URLMODE_KEEP_ORIGINAL) {
|
||||
if (StringNotEmpty(opt->footer)) {
|
||||
char BIGSTK tempo[1024 + HTS_URLMAXSIZE * 2];
|
||||
char gmttime[256];
|
||||
@@ -1340,6 +1383,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int can_avoid_quotes = 0;
|
||||
char quotes_replacement = '\0';
|
||||
int ensure_not_mime = 0;
|
||||
// .open(method,url): reject an HTTP-method first arg (#218)
|
||||
int ensure_not_method = 0;
|
||||
// @import: the quoted token is the URL; a trailing
|
||||
// media/supports/layer condition is not part of it
|
||||
int is_import = 0;
|
||||
|
||||
if (inscript_tag)
|
||||
expected_end = ";\"\'"; // voir a href="javascript:doc.location='foo'"
|
||||
@@ -1356,9 +1404,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (!nc)
|
||||
nc = strfield(html, ":location"); // javascript:location="doc"
|
||||
if (!nc) { // location="doc"
|
||||
if ((nc = strfield(html, "location"))
|
||||
&& !isspace(*(html - 1))
|
||||
)
|
||||
if ((nc = strfield(html, "location")) &&
|
||||
!isspace(html_prevc(html, r->adr)))
|
||||
nc = 0;
|
||||
}
|
||||
if (!nc)
|
||||
@@ -1368,6 +1415,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = "),"; // fin: virgule ou parenthèse
|
||||
ensure_not_mime = 1; //* ensure the url is not a mime type */
|
||||
ensure_not_method = 1; // xhr.open: don't grab method
|
||||
}
|
||||
if (!nc)
|
||||
if ((nc = strfield(html, ".replace"))) { // window.replace("url")
|
||||
@@ -1379,7 +1427,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
}
|
||||
if (!nc && (nc = strfield(html, "url")) && (!isalnum(*(html - 1))) && *(html - 1) != '_') { // url(url)
|
||||
if (!nc && (nc = strfield(html, "url")) &&
|
||||
(!isalnum(html_prevc(html, r->adr))) &&
|
||||
html_prevc(html, r->adr) != '_') { // url(url)
|
||||
expected = '('; // parenthèse
|
||||
expected_end = ")"; // fin: parenthèse
|
||||
can_avoid_quotes = 1;
|
||||
@@ -1389,6 +1439,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((nc = strfield(html, "import"))) { // import "url"
|
||||
if (is_space(*(html + nc))) {
|
||||
expected = 0; // no char expected
|
||||
is_import = 1;
|
||||
} else
|
||||
nc = 0;
|
||||
}
|
||||
@@ -1406,6 +1457,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((*a == 34) || (*a == '\'') || (can_avoid_quotes)) {
|
||||
const char *b, *c;
|
||||
int ndelim = 1;
|
||||
int valid_url = 0;
|
||||
|
||||
if ((*a == 34) || (*a == '\''))
|
||||
a++;
|
||||
@@ -1420,12 +1472,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
b++;
|
||||
}
|
||||
c = b--;
|
||||
c += ndelim;
|
||||
while(*c == ' ')
|
||||
c++;
|
||||
if ((strchr(expected_end, *c)) || (*c == '\n')
|
||||
|| (*c == '\r')) {
|
||||
c -= (ndelim + 1);
|
||||
// no closing delimiter here (truncated input):
|
||||
// Don't scan past the buffer NUL or capture it.
|
||||
if (*c != '\0') {
|
||||
c += ndelim;
|
||||
while (*c == ' ')
|
||||
c++;
|
||||
valid_url =
|
||||
(strchr(expected_end, *c)) || (*c == '\n') ||
|
||||
(*c == '\r') ||
|
||||
(is_import && *(b + 1 + ndelim) == ' ');
|
||||
}
|
||||
if (valid_url) {
|
||||
// URL end = last char (b), not the delimiter
|
||||
c = b;
|
||||
if ((int) (c - a + 1)) {
|
||||
if (ensure_not_mime) {
|
||||
int i = 0;
|
||||
@@ -1441,6 +1501,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
// XHR.open's "GET" etc. is a method, not a URL
|
||||
if (a != NULL && ensure_not_method &&
|
||||
is_http_method(a, (size_t) (c - a + 1))) {
|
||||
a = NULL;
|
||||
}
|
||||
// Check for bogus links (Vasiliy)
|
||||
if (a != NULL) {
|
||||
const size_t size = c - a + 1;
|
||||
@@ -1484,7 +1549,6 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1691,6 +1755,24 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
hts_nodetect[i -
|
||||
1]);
|
||||
}
|
||||
// xmlns / xmlns:prefix declare
|
||||
// XML namespaces, not resources
|
||||
// (#191)
|
||||
else {
|
||||
const int xl = strfield(
|
||||
intag_startattr, "xmlns");
|
||||
const char xc =
|
||||
intag_startattr[xl];
|
||||
if (xl &&
|
||||
(xc == ':' || xc == '=' ||
|
||||
is_space(xc))) {
|
||||
url_ok = 0;
|
||||
hts_log_print(
|
||||
opt, LOG_DEBUG,
|
||||
"dirty parsing: xmlns "
|
||||
"namespace avoided");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1746,7 +1828,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
// écrire codebase avant, flusher avant code
|
||||
if ((p_type == -1) || (p_type == -2)) {
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
HT_add_adr; // refresh
|
||||
}
|
||||
lastsaved = html; // dernier écrit+1
|
||||
@@ -1837,7 +1919,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
|
||||
// ne pas flusher après code si on doit écrire le codebase avant!
|
||||
if ((p_type != -1) && (p_type != 2) && (p_type != -2)) {
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
HT_add_adr; // refresh
|
||||
}
|
||||
lastsaved = html; // dernier écrit+1
|
||||
@@ -1914,7 +1996,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (*html != '#') { // Not empty+unique #
|
||||
if (eadr - html == 1) { // 1=link empty with delim (end_adr-start_adr)
|
||||
if (quote) {
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
HT_ADD("#"); // We add this for a <href="">
|
||||
}
|
||||
}
|
||||
@@ -2569,7 +2651,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if ((p_type == 2) || (p_type == -2)) { // base href ou codebase, pas un lien
|
||||
hts_log_print(opt, LOG_DEBUG, "Code/Codebase: %s%s",
|
||||
afs.af.adr, afs.af.fil);
|
||||
} else if ((opt->getmode & 4) == 0) {
|
||||
} else if ((opt->getmode & HTS_GETMODE_HTML_FIRST) ==
|
||||
0) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Record: %s%s -> %s",
|
||||
afs.af.adr, afs.af.fil, afs.save);
|
||||
} else {
|
||||
@@ -2592,8 +2675,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
lastsaved = eadr - 1 + 1; // sauter "
|
||||
}
|
||||
/* */
|
||||
else if (opt->urlmode == 0) { // URL absolue dans tous les cas
|
||||
if ((opt->getmode & 1) && (ptr > 0)) { // ecrire les html
|
||||
else if (opt->urlmode == HTS_URLMODE_ABSOLUTE) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
if (!link_has_authority(afs.af.adr)) {
|
||||
HT_ADD("http://");
|
||||
} else {
|
||||
@@ -2620,12 +2703,14 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
lastsaved = eadr - 1; // dernier écrit+1 (enfin euh apres on fait un ++ alors hein)
|
||||
/* */
|
||||
} else if (opt->urlmode == 4) { // ne rien faire!
|
||||
} else if (opt->urlmode == HTS_URLMODE_KEEP_ORIGINAL) {
|
||||
/* */
|
||||
/* leave the link 'as is' */
|
||||
/* Sinon, dépend de interne/externe */
|
||||
} else if (forbidden_url == 1) { // le lien ne sera pas chargé, référence externe!
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
} else if (forbidden_url ==
|
||||
1) { // le lien ne sera pas chargé, référence
|
||||
// externe!
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
if (p_type != -1) { // pas que le nom de fichier (pas classe java)
|
||||
if (!opt->external) {
|
||||
if (!link_has_authority(afs.af.adr)) {
|
||||
@@ -2674,7 +2759,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
'/') ? 1 : (ishtml(opt, afs.af.fil)))) {
|
||||
case 1:
|
||||
case -2: // html ou répertoire
|
||||
if (opt->getmode & 1) { // sauver html
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
patch_it = 1; // redirect
|
||||
add_url = 1; // avec link?
|
||||
cat_name = "external.html";
|
||||
@@ -2847,7 +2932,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// érire codebase="chemin"
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) &&
|
||||
(ptr > 0)) {
|
||||
char BIGSTK tempo4[HTS_URLMAXSIZE * 2];
|
||||
|
||||
tempo4[0] = '\0';
|
||||
@@ -2875,9 +2961,11 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
lastsaved = eadr - 1;
|
||||
}
|
||||
/*
|
||||
else if (opt->urlmode==1) { // ABSOLU, c'est le cas le moins courant
|
||||
else if (opt->urlmode==1) { // ABSOLU, c'est le cas le
|
||||
moins courant
|
||||
// NE FONCTIONNE PAS!! (et est inutile)
|
||||
if ((opt->getmode & 1) && (ptr>0)) { // ecrire les html
|
||||
if ((opt->getmode & 1) && (ptr>0)) { // ecrire les
|
||||
html
|
||||
// écrire le lien modifié, absolu
|
||||
HT_ADD("file:");
|
||||
if (*save=='/')
|
||||
@@ -2885,7 +2973,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
else
|
||||
HT_ADD(save)
|
||||
}
|
||||
lastsaved=eadr-1; // dernier écrit+1 (enfin euh apres on fait un ++ alors hein)
|
||||
lastsaved=eadr-1; // dernier écrit+1 (enfin euh apres
|
||||
on fait un ++ alors hein)
|
||||
}
|
||||
*/
|
||||
else if (opt->mimehtml) {
|
||||
@@ -2895,18 +2984,18 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
make_content_id(afs.af.adr, afs.af.fil, cid, sizeof(cid));
|
||||
HT_ADD_HTMLESCAPED(cid);
|
||||
lastsaved = eadr - 1; // dernier écrit+1 (enfin euh apres on fait un ++ alors hein)
|
||||
} else if (opt->urlmode == 3) { // URI absolue /
|
||||
if ((opt->getmode & 1) && (ptr > 0)) { // ecrire les html
|
||||
} else if (opt->urlmode == HTS_URLMODE_ABSOLUTE_URI) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
HT_ADD_HTMLESCAPED(afs.af.fil);
|
||||
}
|
||||
lastsaved = eadr - 1; // dernier écrit+1 (enfin euh apres on fait un ++ alors hein)
|
||||
} else if (opt->urlmode == 5) { // transparent proxy URL
|
||||
} else if (opt->urlmode == HTS_URLMODE_TRANSPARENT_PROXY) {
|
||||
char BIGSTK tempo[HTS_URLMAXSIZE * 2];
|
||||
const char *uri;
|
||||
int i;
|
||||
char *pos;
|
||||
|
||||
if ((opt->getmode & 1) && (ptr > 0)) { // ecrire les html
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
if (!link_has_authority(afs.af.adr)) {
|
||||
HT_ADD("http://");
|
||||
} else {
|
||||
@@ -2947,7 +3036,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
HT_ADD_HTMLESCAPED(tempo);
|
||||
}
|
||||
lastsaved = eadr - 1; // dernier écrit+1 (enfin euh apres on fait un ++ alors hein)
|
||||
} else if (opt->urlmode == 2) { // RELATIF
|
||||
} else if (opt->urlmode == HTS_URLMODE_RELATIVE) {
|
||||
char BIGSTK tempo[HTS_URLMAXSIZE * 2];
|
||||
|
||||
tempo[0] = '\0';
|
||||
@@ -2959,6 +3048,10 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
/* Never escape high-chars (we don't know the encoding!!) */
|
||||
inplace_escape_uri_utf(tempo, sizeof(tempo));
|
||||
|
||||
// unquoted CSS url(...): keep parens escaped (#163)
|
||||
if (ending_p == ')')
|
||||
escape_url_parens(tempo, sizeof(tempo));
|
||||
|
||||
//if (!no_esc_utf)
|
||||
// escape_uri(tempo); // escape with %xx
|
||||
//else {
|
||||
@@ -3009,7 +3102,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
// érire codebase="chemin"
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) &&
|
||||
(ptr > 0)) {
|
||||
char BIGSTK tempo4[HTS_URLMAXSIZE * 2];
|
||||
|
||||
tempo4[0] = '\0';
|
||||
@@ -3027,7 +3121,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
//lastsaved=adr; // dernier écrit+1
|
||||
}
|
||||
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
// convert to local codepage - NOT, already converted into %NN, and passed to the remote server so we do not have anything to do
|
||||
//if (str->page_charset_ != NULL && *str->page_charset_ != '\0') {
|
||||
// char *const local_save = hts_convertStringFromUTF8(tempo, strlen(tempo), str->page_charset_);
|
||||
@@ -3061,7 +3155,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
"Error building relative link %s and %s",
|
||||
afs.save, relativesavename());
|
||||
}
|
||||
} // sinon le lien sera écrit normalement
|
||||
} // sinon le lien sera écrit normalement
|
||||
|
||||
#if 0
|
||||
if (fexist(save)) { // le fichier existe..
|
||||
@@ -3089,7 +3183,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
opt->maxlink);
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"To avoid that: use #L option for more links (example: -#L1000000)");
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
if (fp) {
|
||||
fclose(fp);
|
||||
fp = NULL;
|
||||
@@ -3101,9 +3195,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
int pass_fix, dejafait = 0;
|
||||
|
||||
// Calculer la priorité de ce lien
|
||||
if ((opt->getmode & 4) == 0) { // traiter html après
|
||||
if ((opt->getmode & HTS_GETMODE_HTML_FIRST) == 0) {
|
||||
pass_fix = 0;
|
||||
} else { // vérifier que ce n'est pas un !html
|
||||
} else { // vérifier que ce n'est pas un !html
|
||||
if (!ishtml(opt, afs.af.fil))
|
||||
pass_fix = 1; // priorité inférieure (traiter après)
|
||||
else
|
||||
@@ -3167,7 +3261,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
if (checkrobots(_ROBOTS, afs.af.adr, "") == -1) { // robots.txt ?
|
||||
// enregistrer robots.txt (MACRO)
|
||||
if (!hts_record_link(opt, afs.af.adr, "/robots.txt", "", "", "", NULL)) {
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) &&
|
||||
(ptr > 0)) {
|
||||
if (fp) {
|
||||
fclose(fp);
|
||||
fp = NULL;
|
||||
@@ -3206,7 +3301,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
// enregistrer
|
||||
if (!hts_record_link(opt, afs.af.adr, afs.af.fil, afs.save,
|
||||
former.adr, former.fil, codebase)) {
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) &&
|
||||
(ptr > 0)) {
|
||||
if (fp) {
|
||||
fclose(fp);
|
||||
fp = NULL;
|
||||
@@ -3351,7 +3447,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
// ----------
|
||||
// écrire peu à peu
|
||||
if ((opt->getmode & 1) && (ptr > 0))
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0))
|
||||
HT_add_adr;
|
||||
lastsaved = html; // dernier écrit+1
|
||||
// ----------
|
||||
@@ -3411,7 +3507,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
opt->state._hts_in_html_parsing = 0; // flag
|
||||
opt->state._hts_cancel = 0; // pas de cancel
|
||||
|
||||
if ((opt->getmode & 1) && (ptr > 0)) {
|
||||
if ((opt->getmode & HTS_GETMODE_HTML) && (ptr > 0)) {
|
||||
{
|
||||
char *cAddr = TypedArrayElts(output_buffer);
|
||||
int cSize = (int) TypedArraySize(output_buffer);
|
||||
@@ -3443,7 +3539,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
//
|
||||
} // if !error
|
||||
|
||||
if (opt->getmode & 1) {
|
||||
if (opt->getmode & HTS_GETMODE_HTML) {
|
||||
if (fp) {
|
||||
fclose(fp);
|
||||
fp = NULL;
|
||||
@@ -3711,7 +3807,8 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
//case -1: can_retry=1; break;
|
||||
case STATUSCODE_TIMEOUT:
|
||||
if (opt->hostcontrol) { // timeout et retry épuisés
|
||||
if ((opt->hostcontrol & 1) && (heap(ptr)->retry <= 0)) {
|
||||
if ((opt->hostcontrol & HTS_HOSTCONTROL_BAN_TIMEOUT) &&
|
||||
(heap(ptr)->retry <= 0)) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
@@ -3724,7 +3821,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
|
||||
break;
|
||||
case STATUSCODE_SLOW:
|
||||
if ((opt->hostcontrol) && (heap(ptr)->retry <= 0)) { // too slow
|
||||
if (opt->hostcontrol & 2) {
|
||||
if (opt->hostcontrol & HTS_HOSTCONTROL_BAN_SLOW) {
|
||||
hts_log_print(opt, LOG_DEBUG, "Link banned: %s%s", urladr(), urlfil());
|
||||
host_ban(opt, ptr, sback, jump_identification_const(urladr()));
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
@@ -4250,10 +4347,10 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
||||
char com[256];
|
||||
|
||||
linput(stdin, com, 200);
|
||||
if (opt->verbosedisplay == 2)
|
||||
opt->verbosedisplay = 1;
|
||||
if (opt->verbosedisplay == HTS_VERBOSE_FULL)
|
||||
opt->verbosedisplay = HTS_VERBOSE_SIMPLE;
|
||||
else
|
||||
opt->verbosedisplay = 2;
|
||||
opt->verbosedisplay = HTS_VERBOSE_FULL;
|
||||
/* Info for wrappers */
|
||||
hts_log_print(opt, LOG_INFO, "engine: change-options");
|
||||
RUN_CALLBACK0(opt, chopt);
|
||||
@@ -4363,7 +4460,7 @@ int hts_mirror_wait_for_next_file(htsmoduleStruct * str,
|
||||
printf("%c\x0d", ("/-\\|")[roll]);
|
||||
fflush(stdout);
|
||||
}
|
||||
} else if (opt->verbosedisplay == 1) {
|
||||
} else if (opt->verbosedisplay == HTS_VERBOSE_SIMPLE) {
|
||||
if (b >= 0) {
|
||||
if (back[b].r.statuscode == HTTP_OK)
|
||||
printf("%d/%d: %s%s (" LLintP " bytes) - OK\33[K\r", ptr, opt->lien_tot,
|
||||
@@ -4454,8 +4551,8 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
char in_error_msg[32];
|
||||
|
||||
// resolve unresolved type
|
||||
if (opt->savename_delayed != 0 && *forbidden_url == 0 && IS_DELAYED_EXT(afs->save)
|
||||
&& !opt->state.stop) {
|
||||
if (opt->savename_delayed != HTS_SAVENAME_DELAYED_NONE &&
|
||||
*forbidden_url == 0 && IS_DELAYED_EXT(afs->save) && !opt->state.stop) {
|
||||
int loops;
|
||||
int continue_loop;
|
||||
|
||||
@@ -4839,7 +4936,7 @@ int hts_wait_delayed(htsmoduleStruct * str, lien_adrfilsave *afs,
|
||||
}
|
||||
}
|
||||
|
||||
} // delayed type check ?
|
||||
} // delayed type check ?
|
||||
|
||||
ENGINE_SAVE_CONTEXT_BASE();
|
||||
|
||||
|
||||
@@ -1213,7 +1213,7 @@ HTSEXT_API find_handle hts_findfirst(char *path) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
HTSEXT_API int hts_findnext(find_handle find) {
|
||||
HTSEXT_API hts_boolean hts_findnext(find_handle find) {
|
||||
if (find) {
|
||||
#ifdef _WIN32
|
||||
if ((FindNextFileA(find->handle, &find->hdata)))
|
||||
@@ -1273,7 +1273,7 @@ HTSEXT_API int hts_findgetsize(find_handle find) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
HTSEXT_API int hts_findisdir(find_handle find) {
|
||||
HTSEXT_API hts_boolean hts_findisdir(find_handle find) {
|
||||
if (find) {
|
||||
if (!hts_findissystem(find)) {
|
||||
#ifdef _WIN32
|
||||
@@ -1287,7 +1287,7 @@ HTSEXT_API int hts_findisdir(find_handle find) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
HTSEXT_API int hts_findisfile(find_handle find) {
|
||||
HTSEXT_API hts_boolean hts_findisfile(find_handle find) {
|
||||
if (find) {
|
||||
if (!hts_findissystem(find)) {
|
||||
#ifdef _WIN32
|
||||
@@ -1301,7 +1301,7 @@ HTSEXT_API int hts_findisfile(find_handle find) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
HTSEXT_API int hts_findissystem(find_handle find) {
|
||||
HTSEXT_API hts_boolean hts_findissystem(find_handle find) {
|
||||
if (find) {
|
||||
#ifdef _WIN32
|
||||
if (find->hdata.
|
||||
|
||||
@@ -108,15 +108,15 @@ HTSEXT_API int hts_buildtopindex(httrackp * opt, const char *path,
|
||||
// Portable directory find functions
|
||||
// Directory find functions
|
||||
HTSEXT_API find_handle hts_findfirst(char *path);
|
||||
HTSEXT_API int hts_findnext(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||
HTSEXT_API int hts_findclose(find_handle find);
|
||||
|
||||
//
|
||||
HTSEXT_API char *hts_findgetname(find_handle find);
|
||||
HTSEXT_API int hts_findgetsize(find_handle find);
|
||||
HTSEXT_API int hts_findisdir(find_handle find);
|
||||
HTSEXT_API int hts_findisfile(find_handle find);
|
||||
HTSEXT_API int hts_findissystem(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
158
src/htswizard.c
158
src/htswizard.c
@@ -178,7 +178,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
// -------------------- PHASE 1 --------------------
|
||||
|
||||
/* Doit-on traiter les non html? */
|
||||
if ((opt->getmode & 2) == 0) { // non on ne doit pas
|
||||
if ((opt->getmode & HTS_GETMODE_NONHTML) == 0) { // non on ne doit pas
|
||||
if (!ishtml(opt, fil)) { // non il ne faut pas
|
||||
//adr[0]='\0'; // ne pas traiter ce lien, pas traiter
|
||||
forbidden_url = 1; // interdire récupération du lien
|
||||
@@ -266,11 +266,11 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
test2 =
|
||||
(strchr(tempo2 + ((*tempo2 == '/') ? 1 : 0), '/') != NULL);
|
||||
if ((test1) && (test2)) { // on ne peut que descendre
|
||||
if ((opt->seeker & 1) == 0) { // interdiction de descendre
|
||||
if ((opt->seeker & HTS_SEEKER_DOWN) == 0) {
|
||||
forbidden_url = 1;
|
||||
hts_log_print(opt, LOG_DEBUG, "lower link canceled: %s%s", adr,
|
||||
fil);
|
||||
} else { // autorisé à priori - NEW
|
||||
} else { // autorisé à priori - NEW
|
||||
if (!heap(ptr)->link_import) { // ne résulte pas d'un 'moved'
|
||||
forbidden_url = 0;
|
||||
hts_log_print(opt, LOG_DEBUG, "lower link authorized: %s%s",
|
||||
@@ -278,7 +278,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
}
|
||||
}
|
||||
} else if ((test1) || (test2)) { // on peut descendre pour accéder au lien
|
||||
if ((opt->seeker & 1) != 0) { // on peut descendre - NEW
|
||||
if ((opt->seeker & HTS_SEEKER_DOWN) != 0) {
|
||||
if (!heap(ptr)->link_import) { // ne résulte pas d'un 'moved'
|
||||
forbidden_url = 0;
|
||||
hts_log_print(opt, LOG_DEBUG, "lower link authorized: %s%s",
|
||||
@@ -290,11 +290,11 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
// up
|
||||
if ((!strncmp(tempo, "../", 3)) && (!strncmp(tempo2, "../", 3))) { // impossible sans monter
|
||||
if ((opt->seeker & 2) == 0) { // interdiction de monter
|
||||
if ((opt->seeker & HTS_SEEKER_UP) == 0) {
|
||||
forbidden_url = 1;
|
||||
hts_log_print(opt, LOG_DEBUG, "upper link canceled: %s%s", adr,
|
||||
fil);
|
||||
} else { // autorisé à monter - NEW
|
||||
} else { // autorisé à monter - NEW
|
||||
if (!heap(ptr)->link_import) { // ne résulte pas d'un 'moved'
|
||||
forbidden_url = 0;
|
||||
hts_log_print(opt, LOG_DEBUG, "upper link authorized: %s%s",
|
||||
@@ -302,13 +302,13 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
}
|
||||
}
|
||||
} else if ((!strncmp(tempo, "../", 3)) || (!strncmp(tempo2, "../", 3))) { // Possible en montant
|
||||
if ((opt->seeker & 2) != 0) { // autorisé à monter - NEW
|
||||
if ((opt->seeker & HTS_SEEKER_UP) != 0) {
|
||||
if (!heap(ptr)->link_import) { // ne résulte pas d'un 'moved'
|
||||
forbidden_url = 0;
|
||||
hts_log_print(opt, LOG_DEBUG, "upper link authorized: %s%s",
|
||||
adr, fil);
|
||||
}
|
||||
} // sinon autorisé en descente
|
||||
} // sinon autorisé en descente
|
||||
}
|
||||
|
||||
} else {
|
||||
@@ -345,83 +345,81 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
//if (!opt->wizard) { // mode non wizard
|
||||
// doit-on traiter ce lien?.. vérifier droits de sortie
|
||||
switch ((opt->travel & 255)) {
|
||||
case 0:
|
||||
switch ((opt->travel & HTS_TRAVEL_SCOPE_MASK)) {
|
||||
case HTS_TRAVEL_SAME_ADDRESS:
|
||||
if (!opt->wizard) // mode non wizard
|
||||
forbidden_url = 1;
|
||||
break; // interdicton de sortir au dela de l'adresse
|
||||
case 1:{ // sortie sur le même dom.xxx
|
||||
size_t i = strlen(adr) - 1;
|
||||
size_t j = strlen(urladr()) - 1;
|
||||
case HTS_TRAVEL_SAME_DOMAIN: {
|
||||
size_t i = strlen(adr) - 1;
|
||||
size_t j = strlen(urladr()) - 1;
|
||||
|
||||
if ((i > 0) && (j > 0)) {
|
||||
while((i > 0) && (adr[i] != '.'))
|
||||
i--;
|
||||
while((j > 0) && (urladr()[j] != '.'))
|
||||
j--;
|
||||
if ((i > 0) && (j > 0)) {
|
||||
i--;
|
||||
j--;
|
||||
while((i > 0) && (adr[i] != '.'))
|
||||
i--;
|
||||
while((j > 0) && (urladr()[j] != '.'))
|
||||
j--;
|
||||
}
|
||||
}
|
||||
if ((i > 0) && (j > 0)) {
|
||||
if (!strfield2(adr + i, urladr() + j)) { // !=
|
||||
if (!opt->wizard) { // mode non wizard
|
||||
//printf("refused: %s\n",adr);
|
||||
forbidden_url = 1; // pas même domaine
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"foreign domain link canceled: %s%s", adr, fil);
|
||||
}
|
||||
|
||||
} else {
|
||||
if (opt->wizard) { // mode wizard
|
||||
forbidden_url = 0; // même domaine
|
||||
hts_log_print(opt, LOG_DEBUG, "same domain link authorized: %s%s",
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
|
||||
} else
|
||||
forbidden_url = 1;
|
||||
}
|
||||
break;
|
||||
case 2:{ // sortie sur le même .xxx
|
||||
size_t i = strlen(adr) - 1;
|
||||
size_t j = strlen(urladr()) - 1;
|
||||
|
||||
while((i > 0) && (adr[i] != '.'))
|
||||
if ((i > 0) && (j > 0)) {
|
||||
while ((i > 0) && (adr[i] != '.'))
|
||||
i--;
|
||||
while((j > 0) && (urladr()[j] != '.'))
|
||||
while ((j > 0) && (urladr()[j] != '.'))
|
||||
j--;
|
||||
if ((i > 0) && (j > 0)) {
|
||||
if (!strfield2(adr + i, urladr() + j)) { // !-
|
||||
if (!opt->wizard) { // mode non wizard
|
||||
//printf("refused: %s\n",adr);
|
||||
forbidden_url = 1; // pas même .xx
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"foreign location link canceled: %s%s", adr, fil);
|
||||
}
|
||||
} else {
|
||||
if (opt->wizard) { // mode wizard
|
||||
forbidden_url = 0; // même domaine
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"same location link authorized: %s%s", adr, fil);
|
||||
}
|
||||
}
|
||||
} else
|
||||
forbidden_url = 1;
|
||||
i--;
|
||||
j--;
|
||||
while ((i > 0) && (adr[i] != '.'))
|
||||
i--;
|
||||
while ((j > 0) && (urladr()[j] != '.'))
|
||||
j--;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 7: // everywhere!!
|
||||
if ((i > 0) && (j > 0)) {
|
||||
if (!strfield2(adr + i, urladr() + j)) { // !=
|
||||
if (!opt->wizard) { // mode non wizard
|
||||
// printf("refused: %s\n",adr);
|
||||
forbidden_url = 1; // pas même domaine
|
||||
hts_log_print(opt, LOG_DEBUG, "foreign domain link canceled: %s%s",
|
||||
adr, fil);
|
||||
}
|
||||
|
||||
} else {
|
||||
if (opt->wizard) { // mode wizard
|
||||
forbidden_url = 0; // même domaine
|
||||
hts_log_print(opt, LOG_DEBUG, "same domain link authorized: %s%s",
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
|
||||
} else
|
||||
forbidden_url = 1;
|
||||
} break;
|
||||
case HTS_TRAVEL_SAME_TLD: {
|
||||
size_t i = strlen(adr) - 1;
|
||||
size_t j = strlen(urladr()) - 1;
|
||||
|
||||
while ((i > 0) && (adr[i] != '.'))
|
||||
i--;
|
||||
while ((j > 0) && (urladr()[j] != '.'))
|
||||
j--;
|
||||
if ((i > 0) && (j > 0)) {
|
||||
if (!strfield2(adr + i, urladr() + j)) { // !-
|
||||
if (!opt->wizard) { // mode non wizard
|
||||
// printf("refused: %s\n",adr);
|
||||
forbidden_url = 1; // pas même .xx
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
"foreign location link canceled: %s%s", adr, fil);
|
||||
}
|
||||
} else {
|
||||
if (opt->wizard) { // mode wizard
|
||||
forbidden_url = 0; // même domaine
|
||||
hts_log_print(opt, LOG_DEBUG, "same location link authorized: %s%s",
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
} else
|
||||
forbidden_url = 1;
|
||||
} break;
|
||||
case HTS_TRAVEL_EVERYWHERE:
|
||||
if (opt->wizard) { // mode wizard
|
||||
forbidden_url = 0;
|
||||
break;
|
||||
}
|
||||
} // switch
|
||||
} // switch
|
||||
|
||||
// ANCIENNE POS -- récupérer les liens à côtés d'un lien (nearlink)
|
||||
|
||||
@@ -583,7 +581,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
// on doit poser la question.. peut on la poser?
|
||||
// (oui je sais quel preuve de délicatesse, merci merci)
|
||||
if ((question) && (ptr > 0) && (!force_mirror)) {
|
||||
if (opt->wizard == 2) { // éliminer tous les liens non répertoriés comme autorisés (ou inconnus)
|
||||
if (opt->wizard == HTS_WIZARD_AUTO) {
|
||||
question = 0;
|
||||
forbidden_url = 1;
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
@@ -600,8 +598,8 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
printf("robots.txt forbidden: %s%s\n", adr, fil);
|
||||
#endif
|
||||
// question résolue, par les filtres, et mode robot non strict
|
||||
if ((!question) && (filters_answer) && (opt->robots == 1)
|
||||
&& (forbidden_url != 1)) {
|
||||
if ((!question) && (filters_answer) &&
|
||||
(opt->robots == HTS_ROBOTS_SOMETIMES) && (forbidden_url != 1)) {
|
||||
r = 0; // annuler interdiction des robots
|
||||
if (!forbidden_url) {
|
||||
hts_log_print(opt, LOG_DEBUG,
|
||||
@@ -685,7 +683,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
io_flush;
|
||||
} else { // lien primaire: autoriser répertoire entier
|
||||
if (!force_mirror) {
|
||||
if ((opt->seeker & 1) == 0) { // interdiction de descendre
|
||||
if ((opt->seeker & HTS_SEEKER_DOWN) == 0) {
|
||||
n = 7;
|
||||
} else {
|
||||
n = 5; // autoriser miroir répertoires descendants (lien primaire)
|
||||
@@ -712,7 +710,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
switch (n) {
|
||||
case -1: // sauter tout le reste
|
||||
forbidden_url = 1;
|
||||
opt->wizard = 2; // sauter tout le reste
|
||||
opt->wizard = HTS_WIZARD_AUTO; // sauter tout le reste
|
||||
break;
|
||||
case 0: // forbid the same link: adr/fil
|
||||
forbidden_url = 1;
|
||||
@@ -796,7 +794,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
break;
|
||||
|
||||
case 5: // allow the whole directory and its children
|
||||
if ((opt->seeker & 2) == 0) { // not allowed to go up
|
||||
if ((opt->seeker & HTS_SEEKER_UP) == 0) { // not allowed to go up
|
||||
size_t i = strlen(fil) - 1;
|
||||
|
||||
while((fil[i] != '/') && (i > 0))
|
||||
@@ -872,7 +870,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
// lien non autorisé, peut-on juste le tester?
|
||||
if (just_test_it) {
|
||||
if (forbidden_url == 1) {
|
||||
if (opt->travel & 256) { // tester tout de même
|
||||
if (opt->travel & HTS_TRAVEL_TEST_ALL) { // tester tout de même
|
||||
if (strfield(adr, "ftp://") == 0) { // PAS ftp!
|
||||
forbidden_url = 1; // oui oui toujours interdit (note: sert à rien car ==1 mais c pour comprendre)
|
||||
*just_test_it = 1; // mais on teste
|
||||
|
||||
@@ -206,7 +206,8 @@ HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
||||
/* Logging */
|
||||
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
||||
opt->log is NULL. Prefer hts_log_print(). */
|
||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg);
|
||||
HTSEXT_API hts_boolean hts_log(httrackp *opt, const char *prefix,
|
||||
const char *msg);
|
||||
|
||||
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
||||
Forwards to the registered log callback, and when the level is <= opt->debug
|
||||
@@ -254,15 +255,6 @@ HTSEXT_API int htswrap_add(httrackp * opt, const char *name, void *fct);
|
||||
or 0 if none or unknown. */
|
||||
HTSEXT_API uintptr_t htswrap_read(httrackp * opt, const char *name);
|
||||
|
||||
/** @warning No implementation is linked into the library; calling this fails to
|
||||
link. For per-callback user data use the CHAIN_FUNCTION() ARGUMENT and
|
||||
CALLBACKARG_USERDEF() instead. */
|
||||
HTSEXT_API int htswrap_set_userdef(httrackp * opt, void *userdef);
|
||||
|
||||
/** @warning No implementation is linked into the library; calling this fails to
|
||||
link. Read per-callback user data with CALLBACKARG_USERDEF() instead. */
|
||||
HTSEXT_API void *htswrap_get_userdef(httrackp * opt);
|
||||
|
||||
/* Internal library allocators, if a different libc is being used by the client */
|
||||
/** strdup() through the library allocator. Returns a heap copy freed with
|
||||
hts_free(), or NULL on failure. */
|
||||
@@ -322,7 +314,8 @@ HTSEXT_API T_SOC catch_url_init(int *port, char *adr);
|
||||
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
||||
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
||||
captured request line. */
|
||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data);
|
||||
HTSEXT_API hts_boolean catch_url(T_SOC soc, char *url, char *method,
|
||||
char *data);
|
||||
|
||||
/* State */
|
||||
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
||||
@@ -343,10 +336,10 @@ HTSEXT_API int hts_is_exiting(httrackp * opt);
|
||||
caller-owned, NULL-terminated array of strings; the engine stores the
|
||||
pointer without copying, so the array and its strings must stay valid until
|
||||
the engine consumes them. @return nonzero if a list is now set. */
|
||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url);
|
||||
HTSEXT_API hts_boolean hts_addurl(httrackp *opt, char **url);
|
||||
|
||||
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
||||
HTSEXT_API int hts_resetaddurl(httrackp * opt);
|
||||
HTSEXT_API hts_boolean hts_resetaddurl(httrackp *opt);
|
||||
|
||||
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
||||
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
||||
@@ -365,7 +358,7 @@ HTSEXT_API int hts_setpause(httrackp * opt, int);
|
||||
lock, so it is safe to call from another thread). @p force is currently
|
||||
ignored.
|
||||
@return 0; no-op if @p opt is NULL. */
|
||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force);
|
||||
HTSEXT_API int hts_request_stop(httrackp *opt, hts_boolean force);
|
||||
|
||||
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
||||
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
||||
@@ -382,7 +375,7 @@ HTSEXT_API void hts_cancel_parsing(httrackp * opt);
|
||||
|
||||
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
||||
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
||||
HTSEXT_API int hts_has_stopped(httrackp * opt);
|
||||
HTSEXT_API hts_boolean hts_has_stopped(httrackp *opt);
|
||||
|
||||
/* Tools */
|
||||
/** Ensure the directory chain leading to @p path exists, creating missing
|
||||
@@ -399,7 +392,7 @@ HTSEXT_API int structcheck_utf8(const char *path);
|
||||
/** Whether the directory containing @p path exists. The basename is stripped
|
||||
first, so passing a file path tests its parent directory. @return 1 if it is
|
||||
a directory, 0 otherwise. */
|
||||
HTSEXT_API int dir_exists(const char *path);
|
||||
HTSEXT_API hts_boolean dir_exists(const char *path);
|
||||
|
||||
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
||||
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
||||
@@ -582,20 +575,15 @@ HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const cha
|
||||
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
||||
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
||||
space. Returns @p catbuff. */
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size, const char *s, const int no_high);
|
||||
|
||||
/** @warning No implementation is linked into the library; calling this fails to
|
||||
link. */
|
||||
HTSEXT_API char *antislash_unescaped(char *catbuff, const char *s);
|
||||
|
||||
HTSEXT_API void escape_remove_control(char *s);
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size,
|
||||
const char *s, const hts_boolean no_high);
|
||||
|
||||
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
||||
@p ssize): user --assume rules, then ".html", then the built-in extension
|
||||
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
||||
0 otherwise. */
|
||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil, int flag);
|
||||
HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil, hts_boolean flag);
|
||||
|
||||
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
||||
HTS_MIMETYPE_SIZE capacity. */
|
||||
@@ -615,7 +603,7 @@ HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil);
|
||||
|
||||
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
||||
known dynamic-page type, else 0. */
|
||||
HTSEXT_API int is_dyntype(const char *fil);
|
||||
HTSEXT_API hts_boolean is_dyntype(const char *fil);
|
||||
|
||||
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
||||
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
||||
@@ -625,12 +613,12 @@ HTSEXT_API const char *get_ext(char *catbuff, size_t size, const char *fil);
|
||||
|
||||
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
||||
and a built-in keep-list of commonly mislabeled types), else 0. */
|
||||
HTSEXT_API int may_unknown(httrackp * opt, const char *st);
|
||||
HTSEXT_API hts_boolean may_unknown(httrackp *opt, const char *st);
|
||||
|
||||
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
||||
always producing a type. @return 1 if a type was written. */
|
||||
HTSEXT_API int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil);
|
||||
HTSEXT_API hts_boolean guess_httptype_sized(httrackp *opt, char *s,
|
||||
size_t ssize, const char *fil);
|
||||
|
||||
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
||||
HTS_MIMETYPE_SIZE capacity. */
|
||||
@@ -692,7 +680,7 @@ HTSEXT_API find_handle hts_findfirst(char *path);
|
||||
|
||||
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
||||
at end of directory. */
|
||||
HTSEXT_API int hts_findnext(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findnext(find_handle find);
|
||||
|
||||
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
||||
HTSEXT_API int hts_findclose(find_handle find);
|
||||
@@ -707,16 +695,16 @@ HTSEXT_API int hts_findgetsize(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
||||
hts_findissystem(), reports 0). */
|
||||
HTSEXT_API int hts_findisdir(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findisdir(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
||||
see hts_findissystem(), reports 0). */
|
||||
HTSEXT_API int hts_findisfile(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findisfile(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
||||
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
||||
temporary entries. Else 0. */
|
||||
HTSEXT_API int hts_findissystem(find_handle find);
|
||||
HTSEXT_API hts_boolean hts_findissystem(find_handle find);
|
||||
|
||||
/* UTF-8 aware FILE API */
|
||||
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
||||
|
||||
@@ -288,7 +288,7 @@ static void __cdecl htsshow_uninit(t_hts_callbackarg * carg) {
|
||||
}
|
||||
static int __cdecl htsshow_start(t_hts_callbackarg * carg, httrackp * opt) {
|
||||
use_show = 0;
|
||||
if (opt->verbosedisplay == 2) {
|
||||
if (opt->verbosedisplay == HTS_VERBOSE_FULL) {
|
||||
use_show = 1;
|
||||
vt_clear();
|
||||
}
|
||||
@@ -852,7 +852,7 @@ static void sig_doback(int blind) { // mettre en backing
|
||||
if (global_opt != NULL) {
|
||||
// suppress logging and asking lousy questions
|
||||
global_opt->quiet = 1;
|
||||
global_opt->verbosedisplay = 0;
|
||||
global_opt->verbosedisplay = HTS_VERBOSE_NONE;
|
||||
}
|
||||
|
||||
if (!blind)
|
||||
|
||||
15
tests/01_engine-cookies.test
Executable file
15
tests/01_engine-cookies.test
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Issue #151 guard: the request Cookie header must be bare RFC 6265 name=value
|
||||
# pairs, no $Version/$Path attributes. Driven by the 'httrack -#Q' selftest.
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#Q' falls through to the usage screen.
|
||||
out=$(httrack -#Q run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
test "$out" = "cookie-header: OK" || {
|
||||
echo "expected 'cookie-header: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
17
tests/01_engine-copyopt.test
Executable file
17
tests/01_engine-copyopt.test
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Regression guard for the unsigned-enum sentinel trap: copy_htsopt's
|
||||
# `if (from->X > -1)` guard is always false for unsigned hts_boolean fields, so
|
||||
# they silently stop being copied. Driven by the in-process 'httrack -#9' test.
|
||||
# Keep POSIX-portable (harness runs it via $(BASH), a plain /bin/sh on macOS).
|
||||
|
||||
set -eu
|
||||
|
||||
# A trailing token is required; a bare '-#9' falls through to the usage screen.
|
||||
out=$(httrack -#9 run)
|
||||
|
||||
# Exact-match the success line so a fall-through to usage can't pass the test.
|
||||
test "$out" = "copy-htsopt: OK" || {
|
||||
echo "expected 'copy-htsopt: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -89,4 +89,37 @@ grep -q NEWCONTENT "$(find "$out" -path '*/a.html' -print -quit)" || {
|
||||
exit 1
|
||||
}
|
||||
|
||||
# --- 3. an empty quoted arg survives the doit.log round-trip (#106) ----------
|
||||
# -%F "" (empty footer) records an empty "" token in doit.log; -r2 follows it so
|
||||
# a "drop the empty token" bug shifts -r2 into -%F's slot (the reprise then sees
|
||||
# -%F -r2 and panics "%F needs to be followed by ..."), making the bug visible
|
||||
# rather than a harmless run off the end of argv.
|
||||
out2="$tmp/out2"
|
||||
rc=0
|
||||
"$bin" "$url" -O "$out2" --quiet -n -%v0 -%F "" -r2 >/dev/null 2>&1 || rc=$?
|
||||
test "$rc" -eq 0 || {
|
||||
echo "FAIL: initial mirror with empty footer exited $rc"
|
||||
exit 1
|
||||
}
|
||||
# precondition: the writer put the empty token on disk for the reader to reload.
|
||||
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||
echo "FAIL: empty footer not recorded as -%F \"\" -r2 in doit.log"
|
||||
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||
exit 1
|
||||
}
|
||||
# no-url reprise: the reader rebuilds argv from doit.log and rewrites doit.log
|
||||
# from it. The empty token surviving in the regenerated file proves the reader
|
||||
# kept it (a drop/swallow would panic above or rewrite -%F without the "").
|
||||
rc=0
|
||||
"$bin" -O "$out2" --quiet >/dev/null 2>&1 || rc=$?
|
||||
test "$rc" -eq 0 || {
|
||||
echo "FAIL: empty-footer reprise exited $rc (empty token dropped from doit.log?)"
|
||||
exit 1
|
||||
}
|
||||
grep -q ' -%F "" -r2' "$out2/hts-cache/doit.log" || {
|
||||
echo "FAIL: empty footer did not survive the doit.log reload round-trip"
|
||||
grep -- '-%F' "$out2/hts-cache/doit.log" || true
|
||||
exit 1
|
||||
}
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -154,4 +154,132 @@ grep -Eq "style=\"background-image:url\('ibgs\.gif'\)\"" "$saved2" ||
|
||||
grep -q 'title="file://' "$saved2" ||
|
||||
! echo "FAIL: a no-detect attribute (title) was wrongly rewritten" || exit 1
|
||||
|
||||
# xmlns / xmlns:prefix decls must not be crawled (#191). Local file:// targets so a
|
||||
# regression downloads them; each is the LAST attr (heuristic only scans a value before '>').
|
||||
site3="$tmp/xmlns"
|
||||
mkdir -p "$site3"
|
||||
for f in ns og rdfs real; do gif "$site3/$f.gif"; done
|
||||
cat >"$site3/index.html" <<EOF
|
||||
<html xmlns="file://$site3/ns.gif"><body>
|
||||
<svg xmlns:og="file://$site3/og.gif"></svg>
|
||||
<div class="c" xmlns:rdfs="file://$site3/rdfs.gif"></div>
|
||||
<a href="file://$site3/real.gif">real link</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out3="$tmp/xmlns-out"
|
||||
crawl "$site3/index.html" "$out3"
|
||||
|
||||
# the real link is still captured
|
||||
found "real.gif" "$out3"
|
||||
# namespace-declaration targets must not be fetched (default + prefixed forms)
|
||||
notfound "ns.gif" "$out3"
|
||||
notfound "og.gif" "$out3"
|
||||
notfound "rdfs.gif" "$out3"
|
||||
|
||||
# CSS @import (#94): every form's target is captured, crawling the .css directly.
|
||||
# The "cond"/"sup"/"spc" cases carry a trailing media/supports/layer condition (or
|
||||
# a space before ';'); they are the negative controls: without the parser fix the
|
||||
# URL is dropped, so a regression fails these found() checks.
|
||||
site4="$tmp/cssimport"
|
||||
mkdir -p "$site4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do printf 'body{}\n' >"$site4/$f.css"; done
|
||||
cat >"$site4/main.css" <<'EOF'
|
||||
@import url(nq.css);
|
||||
@import url("dqu.css");
|
||||
@import url('squ.css');
|
||||
@import "dqs.css";
|
||||
@import 'sqs.css';
|
||||
@import url(med.css) screen and (min-width: 400px);
|
||||
@import "cond.css" screen;
|
||||
@import "sup.css" supports(display: flex);
|
||||
@import url(lay.css) layer(base);
|
||||
@import "spc.css" ;
|
||||
EOF
|
||||
out4="$tmp/cssimport-out"
|
||||
crawl "$site4/main.css" "$out4"
|
||||
for f in nq dqu squ dqs sqs med cond sup lay spc; do found "$f.css" "$out4"; done
|
||||
|
||||
# Over-capture guard: the trailing condition is not part of the URL, so it must
|
||||
# survive the rewrite verbatim. A regression that grabs it would mangle these.
|
||||
m4=$(find "$out4" -type f -path '*/file/*' -name main.css -print -quit)
|
||||
test -n "$m4" || ! echo "FAIL: saved main.css not found" || exit 1
|
||||
for cond in '@import "cond.css" screen;' 'supports(display: flex)' 'layer(base)'; do
|
||||
grep -Fq "$cond" "$m4" ||
|
||||
! echo "FAIL #94: '$cond' altered on rewrite (condition captured as URL?)" || exit 1
|
||||
done
|
||||
|
||||
# Malformed input: an unterminated @import quote (truncated CSS) must not crash or
|
||||
# capture a bogus link; a valid sibling import is still captured. Guards a heap
|
||||
# overflow on the URL-end scan that aborts under ASan (CI sanitizer job).
|
||||
site5="$tmp/cssimport-trunc"
|
||||
mkdir -p "$site5"
|
||||
printf 'body{}\n' >"$site5/good.css"
|
||||
printf '@import "good.css";\n@import "trunc' >"$site5/main.css"
|
||||
out5="$tmp/cssimport-trunc-out"
|
||||
crawl "$site5/main.css" "$out5"
|
||||
found "good.css" "$out5"
|
||||
notfound "trunc" "$out5"
|
||||
|
||||
# Offset-0 underflow (#396): a token at the buffer start makes the detector's
|
||||
# word-boundary guard read *(html-1) one byte early (aborts under ASan). The
|
||||
# url() target is still captured; here it just must not underflow.
|
||||
site6="$tmp/parse-off0"
|
||||
mkdir -p "$site6"
|
||||
printf 'body{}\n' >"$site6/off0.css"
|
||||
printf 'url(off0.css)\n' >"$site6/main.css"
|
||||
out6="$tmp/parse-off0-out"
|
||||
crawl "$site6/main.css" "$out6"
|
||||
found "off0.css" "$out6"
|
||||
|
||||
# XMLHttpRequest.open(method, url) (#218): the first argument is an HTTP method,
|
||||
# not a URL. Without the fix "GET" is captured as a link and fetched (the offline
|
||||
# fixture saves a bare file named GET; a live server mangles it to GET.html).
|
||||
# window.open(url) detection must be unaffected.
|
||||
site7="$tmp/xhropen"
|
||||
mkdir -p "$site7"
|
||||
gif "$site7/winopen.gif"
|
||||
cat >"$site7/index.html" <<EOF
|
||||
<html><body><script>
|
||||
var x = new XMLHttpRequest();
|
||||
x.open("GET", "ajax_info.txt");
|
||||
var y = new XMLHttpRequest();
|
||||
y.open("Post", "submit.cgi");
|
||||
window.open("file://$site7/winopen.gif");
|
||||
</script></body></html>
|
||||
EOF
|
||||
out7="$tmp/xhropen-out"
|
||||
crawl "$site7/index.html" "$out7"
|
||||
# negative control: without the fix a file named exactly GET is downloaded
|
||||
notfound "GET" "$out7"
|
||||
# methods are matched case-insensitively (XHR spec normalizes them): a mixed-case
|
||||
# method is rejected too, so a file named Post must not appear either
|
||||
notfound "Post" "$out7"
|
||||
# regression guard: window.open(url) is still detected, so its absolute URL is
|
||||
# rewritten to a local link. The rewrite only happens if the parser saw it, so
|
||||
# these two assertions fail if .open detection broke (not a trivial --near save).
|
||||
saved7=$(savedhtml "$out7")
|
||||
test -n "$saved7" || ! echo "FAIL: saved xhr page not found" || exit 1
|
||||
grep -Fq 'window.open("winopen.gif")' "$saved7" ||
|
||||
! echo "FAIL #218: window.open(url) no longer detected/rewritten" || exit 1
|
||||
! grep -Fq 'window.open("file://' "$saved7" ||
|
||||
! echo "FAIL #218: window.open URL left absolute (not rewritten)" || exit 1
|
||||
|
||||
# Parens inside an unquoted CSS url(...) (#163): the saved-on-disk name has
|
||||
# literal '(' ')' (the source %28/%29 decode when fetching), but a literal ')'
|
||||
# in the rewritten url() would close the token early and break the value, so
|
||||
# they must stay percent-encoded. Negative control: without the fix the output
|
||||
# is url(img%20(1).gif) and the grep for %281%29 fails (parens are RFC2396
|
||||
# "mark" chars, which the URI escaper leaves alone).
|
||||
site8="$tmp/cssparens"
|
||||
mkdir -p "$site8"
|
||||
gif "$site8/img (1).gif"
|
||||
printf 'body { background-image: url(img%%20%%281%%29.gif); }\n' >"$site8/style.css"
|
||||
out8="$tmp/cssparens-out"
|
||||
crawl "$site8/style.css" "$out8"
|
||||
found "img (1).gif" "$out8"
|
||||
css8=$(find "$out8" -type f -path '*/file/*' -name style.css -print -quit)
|
||||
test -n "$css8" || ! echo "FAIL: saved style.css not found" || exit 1
|
||||
grep -Fq 'url(img%20%281%29.gif)' "$css8" ||
|
||||
! echo "FAIL #163: parens in unquoted CSS url() not percent-encoded on rewrite" || exit 1
|
||||
|
||||
exit 0
|
||||
|
||||
136
tests/13_crawl_proxy_https.test
Normal file
136
tests/13_crawl_proxy_https.test
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Issue #85: an https crawl must go through the configured proxy (CONNECT
|
||||
# tunnel), not bypass it and hit the origin directly. Fully local: a self-signed
|
||||
# TLS origin plus a logging CONNECT proxy, so no network access is needed.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
if test "${HTTPS_SUPPORT:-}" == "no"; then
|
||||
echo "no https support compiled, skipping"
|
||||
exit 77
|
||||
fi
|
||||
if ! command -v python3 >/dev/null 2>&1 || ! command -v openssl >/dev/null 2>&1; then
|
||||
echo "python3/openssl missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
|
||||
server="$top_srcdir/tests/proxy-https-server.py"
|
||||
tmpdir=$(mktemp -d)
|
||||
pids=
|
||||
|
||||
cleanup() {
|
||||
for pid in $pids; do
|
||||
kill "$pid" 2>/dev/null || true
|
||||
done
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# self-signed cert for the local TLS origin (httrack does not verify certs)
|
||||
openssl req -x509 -newkey rsa:2048 -keyout "$tmpdir/key.pem" \
|
||||
-out "$tmpdir/cert.pem" -days 2 -nodes -subj "/CN=127.0.0.1" \
|
||||
>/dev/null 2>&1
|
||||
cat "$tmpdir/key.pem" "$tmpdir/cert.pem" >"$tmpdir/both.pem"
|
||||
|
||||
# start_server <logdir> <mode>: launches a proxy+origin pair, sets $origin_port
|
||||
# and $proxy_port from its announced ephemeral ports.
|
||||
start_server() {
|
||||
local dir="$1" mode="$2" ports
|
||||
mkdir -p "$dir"
|
||||
ports="$dir/ports.txt"
|
||||
python3 "$server" "$tmpdir/both.pem" "$dir" "$mode" \
|
||||
>"$ports" 2>"$dir/server.err" &
|
||||
pids="$pids $!"
|
||||
for _ in $(seq 1 100); do
|
||||
grep -q "^ready" "$ports" 2>/dev/null && break
|
||||
sleep 0.1
|
||||
done
|
||||
grep -q "^ready" "$ports" 2>/dev/null || {
|
||||
echo "server ($mode) did not start" >&2
|
||||
cat "$dir/server.err" >&2
|
||||
exit 1
|
||||
}
|
||||
origin_port=$(awk '/^ORIGIN/{print $2}' "$ports")
|
||||
proxy_port=$(awk '/^PROXY/{print $2}' "$ports")
|
||||
}
|
||||
|
||||
# Run httrack, but kill it after a deadline so a hang (e.g. a missing bound on
|
||||
# the proxy response) surfaces as the kill code $HANG_RC instead of stalling the
|
||||
# whole job. A portable stand-in for `timeout`, which macOS lacks.
|
||||
HANG_RC=137 # 128 + SIGKILL
|
||||
run_crawl() {
|
||||
local out="$1" proxy="$2" port="$3"
|
||||
rm -rf "$out"
|
||||
httrack "https://127.0.0.1:${port}/" --proxy "$proxy" \
|
||||
-O "$out" -r1 -s0 --timeout=10 >"$out.log" 2>&1 &
|
||||
local pid=$!
|
||||
(sleep 60 && kill -9 "$pid" 2>/dev/null) &
|
||||
local guard=$!
|
||||
local rc=0
|
||||
wait "$pid" 2>/dev/null || rc=$?
|
||||
kill "$guard" 2>/dev/null || true
|
||||
wait "$guard" 2>/dev/null || true
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
# --- working proxy ----------------------------------------------------------
|
||||
ok="$tmpdir/ok"
|
||||
start_server "$ok" ok
|
||||
|
||||
# 1. page retrieved AND the proxy saw a CONNECT to the origin
|
||||
run_crawl "$ok/out" "127.0.0.1:${proxy_port}" "$origin_port"
|
||||
grep -rq "ORIGIN-PAGE-85" "$ok/out" || {
|
||||
echo "FAIL: origin page not downloaded through proxy" >&2
|
||||
cat "$ok/out.log" >&2
|
||||
exit 1
|
||||
}
|
||||
grep -q "^CONNECT 127.0.0.1:${origin_port} " "$ok/proxy.log" || {
|
||||
echo "FAIL: proxy never received a CONNECT (https bypassed the proxy)" >&2
|
||||
cat "$ok/proxy.log" >&2
|
||||
exit 1
|
||||
}
|
||||
echo "OK: https tunneled through proxy via CONNECT"
|
||||
|
||||
# 2. authenticated proxy: creds ride the CONNECT, and NEVER reach the origin
|
||||
: >"$ok/proxy.log"
|
||||
: >"$ok/origin-headers.log"
|
||||
run_crawl "$ok/out2" "user:secret@127.0.0.1:${proxy_port}" "$origin_port"
|
||||
grep -rq "ORIGIN-PAGE-85" "$ok/out2" || {
|
||||
echo "FAIL: origin page not downloaded through authenticated proxy" >&2
|
||||
exit 1
|
||||
}
|
||||
got=$(awk '/^AUTH Basic /{print $3}' "$ok/proxy.log" | head -1)
|
||||
# base64("user:secret"); compared as a literal to stay portable (no base64 -d,
|
||||
# which differs between GNU and BSD)
|
||||
test "$got" == "dXNlcjpzZWNyZXQ=" || {
|
||||
echo "FAIL: Proxy-Authorization not carried on CONNECT (got '$got')" >&2
|
||||
cat "$ok/proxy.log" >&2
|
||||
exit 1
|
||||
}
|
||||
if grep -qi "proxy-authorization" "$ok/origin-headers.log"; then
|
||||
echo "FAIL: proxy credentials leaked to the origin through the tunnel" >&2
|
||||
cat "$ok/origin-headers.log" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "OK: proxy credentials carried on CONNECT, not leaked to origin"
|
||||
|
||||
# --- hostile proxy ----------------------------------------------------------
|
||||
# A proxy that answers 200 then streams headers forever must not hang the crawl:
|
||||
# the client bounds the response. run_crawl kills a hung httrack after 60s, so a
|
||||
# missing bound surfaces as $HANG_RC here.
|
||||
flood="$tmpdir/flood"
|
||||
start_server "$flood" flood
|
||||
rc=0
|
||||
run_crawl "$flood/out" "127.0.0.1:${proxy_port}" "$origin_port" || rc=$?
|
||||
test "$rc" -ne "$HANG_RC" || {
|
||||
echo "FAIL: crawl hung on a flooding proxy (bounded read missing)" >&2
|
||||
exit 1
|
||||
}
|
||||
grep -rq "ORIGIN-PAGE-85" "$flood/out" 2>/dev/null && {
|
||||
echo "FAIL: flooding proxy unexpectedly served the page" >&2
|
||||
exit 1
|
||||
}
|
||||
echo "OK: bounded proxy response, no hang on a flooding proxy"
|
||||
@@ -2,6 +2,7 @@
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
proxy-https-server.py \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
@@ -24,6 +25,8 @@ TESTS = \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-cookies.test \
|
||||
01_engine-copyopt.test \
|
||||
01_engine-doitlog.test \
|
||||
01_engine-entities.test \
|
||||
01_engine-filter.test \
|
||||
@@ -42,6 +45,7 @@ TESTS = \
|
||||
11_crawl-international.test \
|
||||
11_crawl-longurl.test \
|
||||
11_crawl-parsing.test \
|
||||
12_crawl_https.test
|
||||
12_crawl_https.test \
|
||||
13_crawl_proxy_https.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
151
tests/proxy-https-server.py
Normal file
151
tests/proxy-https-server.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Local CONNECT proxy + self-signed HTTPS origin for the issue #85 test.
|
||||
|
||||
Starts a TLS origin server and an HTTP proxy that honours CONNECT, on ephemeral
|
||||
ports. Every request line the proxy receives (and any Proxy-Authorization) is
|
||||
appended to the proxy log; every header the origin receives over the tunnel is
|
||||
appended to the origin log. That lets the test assert both that an https crawl
|
||||
tunneled through the proxy and that proxy credentials never leaked to the origin.
|
||||
|
||||
Proxy modes (argv[3], default "ok"):
|
||||
ok - honour CONNECT and tunnel to the origin
|
||||
flood - answer 200 then stream headers forever with no blank line, to exercise
|
||||
the client's bound on the proxy response (must not hang the crawl)
|
||||
|
||||
Usage: proxy-https-server.py <cert.pem> <logdir> [mode]
|
||||
Prints "ORIGIN <port>", "PROXY <port>", then "ready" (one per line) on stdout.
|
||||
"""
|
||||
import http.server
|
||||
import os
|
||||
import socket
|
||||
import socketserver
|
||||
import ssl
|
||||
import sys
|
||||
import threading
|
||||
|
||||
ORIGIN_BODY = b"<html><body>ORIGIN-PAGE-85</body></html>"
|
||||
PROXY_LOG = "proxy.log"
|
||||
ORIGIN_LOG = "origin-headers.log"
|
||||
|
||||
|
||||
def make_origin(logdir):
    """Build the request-handler class used by the HTTPS origin.

    The returned handler answers every GET with ORIGIN_BODY. Before replying
    it appends the name (not the value) of each request header it received to
    ORIGIN_LOG inside ``logdir``, one name per line.
    """

    class Origin(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # Log first, reply second: the header names are on disk before
            # the client sees the response.
            seen = "".join(name + "\n" for name in self.headers.keys())
            with open(os.path.join(logdir, ORIGIN_LOG), "a") as log:
                log.write(seen)

            payload = ORIGIN_BODY
            self.send_response(200)
            for field, value in (
                ("Content-Type", "text/html"),
                ("Content-Length", str(len(payload))),
            ):
                self.send_header(field, value)
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, *args):
            """Discard http.server's per-request log output."""

    return Origin
|
||||
|
||||
|
||||
def start_origin(certfile, logdir):
    """Start the TLS origin on an ephemeral 127.0.0.1 port and return the port.

    ``certfile`` is the only argument given to ``load_cert_chain``, so it has
    to contain the private key as well as the certificate. The server is
    served from a daemon thread and is never shut down explicitly.
    """
    server = socketserver.TCPServer(("127.0.0.1", 0), make_origin(logdir))
    tls = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    tls.load_cert_chain(certfile)
    # Wrap the listening socket so every accepted connection speaks TLS.
    server.socket = tls.wrap_socket(server.socket, server_side=True)
    _, bound_port = server.socket.getsockname()
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    return bound_port
|
||||
|
||||
|
||||
def pipe(src, dst):
    """Relay bytes from ``src`` to ``dst`` until EOF on ``src`` or a socket error.

    On the way out both sockets are shut down in both directions; they are
    not closed here. Socket errors are swallowed: a broken tunnel simply ends
    the relay.
    """
    try:
        # recv() returning b"" means the peer finished sending.
        for chunk in iter(lambda: src.recv(65536), b""):
            dst.sendall(chunk)
    except OSError:
        pass
    finally:
        for endpoint in (src, dst):
            try:
                endpoint.shutdown(socket.SHUT_RDWR)
            except OSError:
                # Already shut down or disconnected.
                pass
|
||||
|
||||
|
||||
def handle_client(conn, logdir, mode):
    """Serve one client connection of the test proxy.

    Reads the request line and headers, then appends the request line (and,
    when a Proxy-Authorization header was sent, an ``AUTH <value>`` line) to
    PROXY_LOG inside ``logdir``. After that:

    * anything other than a CONNECT request is answered with 501;
    * in ``"flood"`` mode a CONNECT gets a 200 status line followed by an
      endless stream of header lines with no terminating blank line;
    * otherwise the CONNECT target (``host[:port]``, port defaulting to 443)
      is dialled and bytes are relayed both ways, or 502 is returned when the
      target cannot be reached or its port is not a number.

    ``conn`` and any upstream socket are always closed before returning.
    Socket errors while replying or relaying end the connection quietly;
    errors writing the log still propagate.
    """
    upstream = None
    try:
        # Unbuffered reader: a buffered one may read past the blank line and
        # silently drop bytes that belong to the tunnel. It is closed as soon
        # as the headers are parsed, because conn.close() does not release the
        # socket while a makefile() object is still open.
        with conn.makefile("rb", buffering=0) as rfile:
            request_line = rfile.readline().decode("latin-1").strip()
            auth = None
            while True:
                line = rfile.readline().decode("latin-1")
                if line in ("\r\n", "\n", ""):
                    break
                key, _, value = line.partition(":")
                if key.strip().lower() == "proxy-authorization":
                    auth = value.strip()

        with open(os.path.join(logdir, PROXY_LOG), "a") as handle:
            handle.write(request_line + "\n")
            if auth is not None:
                handle.write("AUTH " + auth + "\n")

        parts = request_line.split()
        try:
            if not (len(parts) >= 2 and parts[0] == "CONNECT"):
                conn.sendall(b"HTTP/1.0 501 Not Implemented\r\n\r\n")
                return

            if mode == "flood":
                # 200, then an endless header stream with no terminating
                # blank line: the client must bound this and give up, not
                # hang. The loop only ends when the client disconnects and
                # sendall() raises OSError.
                conn.sendall(b"HTTP/1.0 200 Connection established\r\n")
                while True:
                    conn.sendall(b"X-Pad: 0123456789\r\n")

            host, _, port = parts[1].partition(":")
            try:
                upstream = socket.create_connection((host, int(port or 443)))
            except (OSError, ValueError):
                # Unreachable target, or a port that is not a number.
                conn.sendall(b"HTTP/1.0 502 Bad Gateway\r\n\r\n")
                return

            conn.sendall(b"HTTP/1.0 200 Connection established\r\n\r\n")
            relay = threading.Thread(
                target=pipe, args=(conn, upstream), daemon=True
            )
            relay.start()
            pipe(upstream, conn)
            # pipe() shut both sockets down on exit, which unblocks the other
            # direction; wait for it so nothing is closed while still in use.
            relay.join()
        except OSError:
            # The client went away while we were replying.
            pass
    finally:
        for sock in (conn, upstream):
            if sock is not None:
                sock.close()
|
||||
|
||||
|
||||
def start_proxy(logdir, mode):
    """Start the proxy on an ephemeral 127.0.0.1 port and return the port.

    Connections are accepted on a daemon thread; each accepted client is
    handed to ``handle_client(conn, logdir, mode)`` on its own daemon thread.
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind(("127.0.0.1", 0))
    listener.listen(16)
    bound_port = listener.getsockname()[1]

    def accept_loop():
        # One thread per client, so a stuck connection cannot block the
        # accept loop.
        while True:
            client, _peer = listener.accept()
            worker = threading.Thread(
                target=handle_client, args=(client, logdir, mode), daemon=True
            )
            worker.start()

    acceptor = threading.Thread(target=accept_loop, daemon=True)
    acceptor.start()
    return bound_port
|
||||
|
||||
|
||||
def main():
    """Entry point: ``proxy-https-server.py <cert.pem> <logdir> [mode]``.

    Creates (or truncates) both log files, starts the origin and the proxy,
    prints "ORIGIN <port>", "PROXY <port>" and "ready" on stdout (one per
    line, flushed), then blocks forever.
    """
    certfile, logdir = sys.argv[1], sys.argv[2]
    mode = "ok" if len(sys.argv) <= 3 else sys.argv[3]

    # Both logs exist and are empty before any server can append to them.
    for log_name in (PROXY_LOG, ORIGIN_LOG):
        with open(os.path.join(logdir, log_name), "w"):
            pass

    origin_port = start_origin(certfile, logdir)
    proxy_port = start_proxy(logdir, mode)

    # Each line is flushed immediately so a reader on a pipe sees it at once.
    for announcement in (
        "ORIGIN %d" % origin_port,
        "PROXY %d" % proxy_port,
        "ready",
    ):
        print(announcement, flush=True)

    # The servers run on daemon threads; park the main thread so the process
    # stays alive until it is killed.
    threading.Event().wait()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user