mirror of
https://github.com/xroche/httrack.git
synced 2026-06-27 12:37:05 +03:00
Compare commits
1 Commits
master
...
fix/mime-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a78806cae |
@@ -3766,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
}
|
||||
#endif
|
||||
/********** **************************** ********** */
|
||||
} else { // il faut aller le chercher
|
||||
}
|
||||
// MIME type excluded by a -mime: filter: abort, don't fetch
|
||||
// the body (#58)
|
||||
else if (HTTP_IS_OK(back[i].r.statuscode) &&
|
||||
!back[i].testmode &&
|
||||
strnotempty(back[i].r.contenttype) &&
|
||||
hts_acceptmime(opt, 0, back[i].url_adr,
|
||||
back[i].url_fil,
|
||||
back[i].r.contenttype) == 1) {
|
||||
deletehttp(&back[i].r);
|
||||
back[i].r.soc = INVALID_SOCKET;
|
||||
back[i].status = STATUS_READY;
|
||||
back_set_finished(sback, i);
|
||||
back[i].r.statuscode = STATUSCODE_EXCLUDED;
|
||||
strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
|
||||
hts_log_print(
|
||||
opt, LOG_NOTICE,
|
||||
"File excluded by MIME type filter (%s): %s%s",
|
||||
back[i].r.contenttype, back[i].url_adr,
|
||||
back[i].url_fil);
|
||||
} else { // il faut aller le chercher
|
||||
|
||||
// effacer buffer (requête)
|
||||
if (!noFreebuff) {
|
||||
@@ -3985,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*} */
|
||||
|
||||
@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
|
||||
STATUSCODE_NON_FATAL = -5,
|
||||
STATUSCODE_SSL_HANDSHAKE = -6,
|
||||
STATUSCODE_TOO_BIG = -7,
|
||||
STATUSCODE_TEST_OK = -10
|
||||
STATUSCODE_TEST_OK = -10,
|
||||
STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
|
||||
} BackStatusCode;
|
||||
|
||||
/** HTTrack status ('status' member of 'lien_back') **/
|
||||
|
||||
16
tests/25_local-mime-exclude.test
Executable file
16
tests/25_local-mime-exclude.test
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# A -mime: exclusion must abort the transfer on the response Content-Type, not
|
||||
# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
|
||||
# the real one: the file is absent either way, but only the fix keeps the count
|
||||
# tiny (header only) instead of pulling the body. Match it positively (a small,
|
||||
# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'mimex/real.html' \
|
||||
--not-found 'mimex/blob.pdf' \
|
||||
--log-found 'excluded by MIME type filter' \
|
||||
--log-found '\[[0-9]{1,4} bytes received' \
|
||||
httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'
|
||||
@@ -66,6 +66,7 @@ TESTS = \
|
||||
21_local-intl-update.test \
|
||||
22_local-broken-size.test \
|
||||
23_local-errpage.test \
|
||||
24_local-resume-overlap.test
|
||||
24_local-resume-overlap.test \
|
||||
25_local-mime-exclude.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -177,6 +177,24 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
body, ctype = self.TYPE_MATRIX[path]
|
||||
self.send_raw(body, ctype)
|
||||
|
||||
# --- MIME-type exclusion abort (issue #58) -----------------------------
|
||||
# A -mime:application/pdf filter must abort the transfer once the header
|
||||
# arrives, not download the whole body and discard it.
|
||||
def route_mimex_index(self):
|
||||
self.send_html(
|
||||
'\t<a href="blob.pdf">pdf</a>\n' '\t<a href="real.html">real</a>\n'
|
||||
)
|
||||
|
||||
# 1 MB body: the fix aborts after the header, so httrack's "bytes received"
|
||||
# stays tiny; without it the engine reads the body and the count jumps.
|
||||
MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
|
||||
|
||||
def route_mimex_blob(self):
|
||||
self.send_raw(self.MIMEX_BLOB, "application/pdf")
|
||||
|
||||
def route_mimex_real(self):
|
||||
self.send_raw(b"<html><body>real</body></html>", "text/html")
|
||||
|
||||
# --- special chars in URLs across an update (issue #157) ---------------
|
||||
# A dotless, accented basename served as text/html (MediaWiki style). The
|
||||
# name the first crawl picks (.html) must survive the update pass.
|
||||
@@ -355,6 +373,9 @@ class Handler(SimpleHTTPRequestHandler):
|
||||
"/errpage/good.html": route_errpage_good,
|
||||
"/errpage/missing.html": route_errpage_missing,
|
||||
"/errpage/empty.html": route_errpage_empty,
|
||||
"/mimex/index.html": route_mimex_index,
|
||||
"/mimex/blob.pdf": route_mimex_blob,
|
||||
"/mimex/real.html": route_mimex_real,
|
||||
}
|
||||
|
||||
# --- dispatch ----------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user