mirror of
https://github.com/xroche/httrack.git
synced 2026-06-29 05:26:32 +03:00
Compare commits
1 Commits
master
...
mime-table
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e2c0341297 |
@@ -129,8 +129,6 @@ typedef enum HTTPStatusCode {
|
||||
HTTP_UNSUPPORTED_MEDIA_TYPE = 415,
|
||||
HTTP_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
|
||||
HTTP_EXPECTATION_FAILED = 417,
|
||||
HTTP_TOO_MANY_REQUESTS = 429,
|
||||
HTTP_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
|
||||
HTTP_INTERNAL_SERVER_ERROR = 500,
|
||||
HTTP_NOT_IMPLEMENTED = 501,
|
||||
HTTP_BAD_GATEWAY = 502,
|
||||
|
||||
@@ -64,7 +64,7 @@ Please visit our Website: http://www.httrack.com
|
||||
// catch_url_init(&port,&return_host);
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox) {
|
||||
T_SOC soc;
|
||||
int try_to_listen_to[] = {8080, 3128, 80, 81, 82, 8081, 3129, 0, -1};
|
||||
int try_to_listen_to[] = { 8080, 3128, 80, 81, 82, 8081, 3129, 31337, 0, -1 };
|
||||
int i = 0;
|
||||
|
||||
do {
|
||||
|
||||
@@ -229,10 +229,6 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEFAULT_FOOTER \
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/" HTTRACK_AFF_VERSION \
|
||||
" " HTTRACK_AFF_AUTHORS ", %s -->"
|
||||
/* Honest crawler User-Agent; no fake OS/browser to go stale. */
|
||||
#define HTS_DEFAULT_USER_AGENT \
|
||||
"Mozilla/5.0 (compatible; HTTrack/" HTTRACK_AFF_VERSION \
|
||||
"; +https://www.httrack.com/)"
|
||||
#define HTTRACK_WEB "http://www.httrack.com"
|
||||
#define HTS_UPDATE_WEBSITE \
|
||||
"http://www.httrack.com/" \
|
||||
|
||||
14
src/htslib.c
14
src/htslib.c
@@ -1951,10 +1951,6 @@ HTSEXT_API const char *infostatuscode_const(int statuscode) {
|
||||
return "Requested Range Not Satisfiable";
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
case 501:
|
||||
@@ -5801,13 +5797,6 @@ HTSEXT_API int hts_init(void) {
|
||||
abortLog("unable to initialize TLS: SSL_CTX_new()");
|
||||
assertf("unable to initialize TLS" == NULL);
|
||||
}
|
||||
/* Pin a TLS floor (no SSLv3/TLS1.0/1.1); no cert verify, by design. */
|
||||
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
|
||||
SSL_CTX_set_min_proto_version(openssl_ctx, TLS1_2_VERSION);
|
||||
#else
|
||||
SSL_CTX_set_options(openssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
|
||||
SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -6059,7 +6048,8 @@ HTSEXT_API httrackp *hts_create_opt(void) {
|
||||
opt->shell = HTS_FALSE;
|
||||
opt->proxy.active = 0; // pas de proxy
|
||||
opt->user_agent_send = HTS_TRUE;
|
||||
StringCopy(opt->user_agent, HTS_DEFAULT_USER_AGENT);
|
||||
StringCopy(opt->user_agent,
|
||||
"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)");
|
||||
StringCopy(opt->referer, "");
|
||||
StringCopy(opt->from, "");
|
||||
opt->savename_83 = HTS_SAVENAME_83_LONG; // long names by default
|
||||
|
||||
@@ -1302,40 +1302,6 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
assertf(ua != NULL);
|
||||
assertf(strcmp(ua, HTS_DEFAULT_USER_AGENT) == 0);
|
||||
/* Teeth independent of the macro: honest token + self-identifier, and no
|
||||
legacy Mozilla/4.x fake-browser string (rejects the whole relic family). */
|
||||
assertf(strstr(ua, "HTTrack/") != NULL);
|
||||
assertf(strstr(ua, "+https://www.httrack.com/") != NULL);
|
||||
assertf(strstr(ua, "Mozilla/4.") == NULL);
|
||||
printf("useragent self-test OK: %s\n", ua);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* HTTP status code -> reason phrase, including the modern 429/451. */
|
||||
static int st_status(httrackp *opt, int argc, char **argv) {
|
||||
const char *s;
|
||||
(void) opt;
|
||||
(void) argc;
|
||||
(void) argv;
|
||||
s = infostatuscode_const(429);
|
||||
assertf(s != NULL && strcmp(s, "Too Many Requests") == 0);
|
||||
s = infostatuscode_const(451);
|
||||
assertf(s != NULL && strcmp(s, "Unavailable For Legal Reasons") == 0);
|
||||
/* A spot-check of a long-standing code, and an unknown one. */
|
||||
s = infostatuscode_const(404);
|
||||
assertf(s != NULL && strcmp(s, "Not Found") == 0);
|
||||
assertf(infostatuscode_const(799) == NULL);
|
||||
printf("status self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1382,8 +1348,6 @@ static const struct selftest_entry {
|
||||
st_cache_writefail},
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
@@ -358,12 +358,12 @@ int smallserver(T_SOC soc, char *url, char *method, char *data, char *path) {
|
||||
{NULL, 0}
|
||||
};
|
||||
initStrElt initStr[] = {
|
||||
{"user", HTS_DEFAULT_USER_AGENT},
|
||||
{"footer", "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x "
|
||||
"[XR&CO'2014], %s -->"},
|
||||
{"url2",
|
||||
"+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}};
|
||||
{"user", "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"},
|
||||
{"footer",
|
||||
"<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->"},
|
||||
{"url2", "+*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/*"},
|
||||
{NULL, NULL}
|
||||
};
|
||||
int i = 0;
|
||||
|
||||
for(i = 0; initInt[i].name; i++) {
|
||||
|
||||
@@ -80,10 +80,6 @@ htspair_t hts_detect_embed[] = {
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
/* HTML5 media siblings of <img src>: same near-link treatment (#451) */
|
||||
static const htspair_t hts_detect_embed_html5[] = {
|
||||
{"source", "src"}, {"source", "srcset"}, {"track", "src"}, {NULL, NULL}};
|
||||
|
||||
/* Internal */
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr, const char *adr,
|
||||
const char *fil, const char *tag,
|
||||
@@ -140,17 +136,6 @@ static int cmp_token(const char *tag, const char *cmp) {
|
||||
&& !isalnum((unsigned char) tag[p]));
|
||||
}
|
||||
|
||||
/* TRUE if (tag, attribute) matches an embedded-asset pair in the table */
|
||||
static hts_boolean is_embed_pair(const htspair_t *table, const char *tag,
|
||||
const char *attribute) {
|
||||
int i;
|
||||
for (i = 0; table[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, table[i].tag) && cmp_token(attribute, table[i].attr))
|
||||
return HTS_TRUE;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
|
||||
static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
const char *adr, const char *fil, const char *tag,
|
||||
const char *attribute, int *set_prio_to,
|
||||
@@ -178,9 +163,15 @@ static int hts_acceptlink_(httrackp * opt, int ptr,
|
||||
|
||||
/* Built-in known tags (<img src=..>, ..) */
|
||||
if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
|
||||
if (is_embed_pair(hts_detect_embed, tag, attribute) ||
|
||||
is_embed_pair(hts_detect_embed_html5, tag, attribute)) {
|
||||
embedded_triggered = 1;
|
||||
int i;
|
||||
|
||||
for(i = 0; hts_detect_embed[i].tag != NULL; i++) {
|
||||
if (cmp_token(tag, hts_detect_embed[i].tag)
|
||||
&& cmp_token(attribute, hts_detect_embed[i].attr)
|
||||
) {
|
||||
embedded_triggered = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -497,12 +497,6 @@ static const char *GetHttpMessage(int statuscode) {
|
||||
case 417:
|
||||
return "Expectation Failed";
|
||||
break;
|
||||
case 429:
|
||||
return "Too Many Requests";
|
||||
break;
|
||||
case 451:
|
||||
return "Unavailable For Legal Reasons";
|
||||
break;
|
||||
case 500:
|
||||
return "Internal Server Error";
|
||||
break;
|
||||
|
||||
@@ -323,33 +323,4 @@ grep -Fq 'href="ahref%20(4).gif"' "$saved9" ||
|
||||
! grep -Eq '(src|href)="[^"]*%28' "$saved9" ||
|
||||
! echo "FAIL #163: gate over-fired onto a non-url() attribute link" || exit 1
|
||||
|
||||
# HTML5 <source>/<track> follow as embedded near-links past the -r2 depth boundary (#451).
|
||||
# img.gif positive control; plain.gif (bare <a href>) negative control proves the gate is selective.
|
||||
site10="$tmp/html5media"
|
||||
mkdir -p "$site10"
|
||||
for f in img ss plain; do gif "$site10/$f.gif"; done
|
||||
printf 'x' >"$site10/v.webm"
|
||||
printf 'x' >"$site10/subs.vtt"
|
||||
cat >"$site10/index.html" <<EOF
|
||||
<html><body><a href="leaf.html">leaf</a></body></html>
|
||||
EOF
|
||||
cat >"$site10/leaf.html" <<EOF
|
||||
<html><body>
|
||||
<img src="img.gif">
|
||||
<picture><source srcset="ss.gif 2x"></picture>
|
||||
<video><source src="v.webm"></video>
|
||||
<video><track src="subs.vtt"></video>
|
||||
<a href="plain.gif">plain link past the boundary</a>
|
||||
</body></html>
|
||||
EOF
|
||||
out10="$tmp/html5media-out"
|
||||
rm -rf "$out10"
|
||||
mkdir -p "$out10"
|
||||
httrack "file://$site10/index.html" -O "$out10" --quiet --near -r2 >"$out10/.log" 2>&1 || true
|
||||
found "img.gif" "$out10"
|
||||
found "ss.gif" "$out10"
|
||||
found "v.webm" "$out10"
|
||||
found "subs.vtt" "$out10"
|
||||
notfound "plain.gif" "$out10"
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# HTTP status -> reason phrase, including the modern 429/451 (#453).
|
||||
httrack -O /dev/null -#test=status run | grep -q "status self-test OK"
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Default User-Agent (#449): honest HTTrack token, no Windows 98 relic.
|
||||
httrack -O /dev/null -#test=useragent run | grep -q "useragent self-test OK"
|
||||
@@ -47,11 +47,9 @@ TESTS = \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
01_engine-status.test \
|
||||
01_engine-stripquery.test \
|
||||
01_engine-strsafe.test \
|
||||
01_engine-urlhack.test \
|
||||
01_engine-useragent.test \
|
||||
02_manpage-regen.test \
|
||||
02_update-cache.test \
|
||||
10_crawl-simple.test \
|
||||
|
||||
Reference in New Issue
Block a user