Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
5a78806cae Abort the download when the response MIME type is excluded by -mime: (#58)
A -mime: exclusion only took effect after the full body had been
downloaded and then discarded (leaving a .delayed temp behind), wasting
bandwidth. Honor it as soon as the response Content-Type arrives:
back_wait now aborts the transfer before the body when hts_acceptmime
forbids the declared type, finishing the slot with a new
STATUSCODE_EXCLUDED clean-skip status rather than fetching and dropping.

Covers the reported case (an HTML-looking URL served as application/pdf
past a +*.html include) and any -mime: match regardless of extension.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 19:53:32 +02:00
5 changed files with 62 additions and 4 deletions

View File

@@ -3766,7 +3766,27 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
}
#endif
/********** **************************** ********** */
} else { // il faut aller le chercher
}
// MIME type excluded by a -mime: filter: abort, don't fetch
// the body (#58)
else if (HTTP_IS_OK(back[i].r.statuscode) &&
!back[i].testmode &&
strnotempty(back[i].r.contenttype) &&
hts_acceptmime(opt, 0, back[i].url_adr,
back[i].url_fil,
back[i].r.contenttype) == 1) {
deletehttp(&back[i].r);
back[i].r.soc = INVALID_SOCKET;
back[i].status = STATUS_READY;
back_set_finished(sback, i);
back[i].r.statuscode = STATUSCODE_EXCLUDED;
strcpybuff(back[i].r.msg, "Excluded by MIME type filter");
hts_log_print(
opt, LOG_NOTICE,
"File excluded by MIME type filter (%s): %s%s",
back[i].r.contenttype, back[i].url_adr,
back[i].url_fil);
} else { // il faut aller le chercher
// effacer buffer (requète)
if (!noFreebuff) {
@@ -3985,7 +4005,6 @@ void back_wait(struct_back * sback, httrackp * opt, cache_back * cache,
}
}
}
/*} */

View File

@@ -146,7 +146,8 @@ typedef enum BackStatusCode {
STATUSCODE_NON_FATAL = -5,
STATUSCODE_SSL_HANDSHAKE = -6,
STATUSCODE_TOO_BIG = -7,
STATUSCODE_TEST_OK = -10
STATUSCODE_TEST_OK = -10,
STATUSCODE_EXCLUDED = -11 /* aborted: MIME excluded by a -mime: filter */
} BackStatusCode;
/** HTTrack status ('status' member of of 'lien_back') **/

View File

@@ -0,0 +1,16 @@
#!/bin/bash
#
# A -mime: exclusion must abort the transfer on the response Content-Type, not
# fetch the whole 1 MB body then discard it (#58). The bytes-received guard is
# the real one: the file is absent either way, but only the fix keeps the count
# tiny (header only) instead of pulling the body. Match it positively (a small,
# <=4-digit count) so a vanished/reworded summary line fails rather than passes.
: "${top_srcdir:=..}"
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
--found 'mimex/real.html' \
--not-found 'mimex/blob.pdf' \
--log-found 'excluded by MIME type filter' \
--log-found '\[[0-9]{1,4} bytes received' \
httrack 'BASEURL/mimex/index.html' '-mime:application/pdf'

View File

@@ -66,6 +66,7 @@ TESTS = \
21_local-intl-update.test \
22_local-broken-size.test \
23_local-errpage.test \
24_local-resume-overlap.test
24_local-resume-overlap.test \
25_local-mime-exclude.test
CLEANFILES = check-network_sh.cache

View File

@@ -177,6 +177,24 @@ class Handler(SimpleHTTPRequestHandler):
body, ctype = self.TYPE_MATRIX[path]
self.send_raw(body, ctype)
# --- MIME-type exclusion abort (issue #58) -----------------------------
# A -mime:application/pdf filter must abort the transfer once the header
# arrives, not download the whole body and discard it.
def route_mimex_index(self):
self.send_html(
'\t<a href="blob.pdf">pdf</a>\n' '\t<a href="real.html">real</a>\n'
)
# 1 MB body: the fix aborts after the header, so httrack's "bytes received"
# stays tiny; without it the engine reads the body and the count jumps.
MIMEX_BLOB = b"%PDF-1.4\n" + b"\x00" * (1024 * 1024)
def route_mimex_blob(self):
self.send_raw(self.MIMEX_BLOB, "application/pdf")
def route_mimex_real(self):
self.send_raw(b"<html><body>real</body></html>", "text/html")
# --- special chars in URLs across an update (issue #157) ---------------
# A dotless, accented basename served as text/html (MediaWiki style). The
# name the first crawl picks (.html) must survive the update pass.
@@ -355,6 +373,9 @@ class Handler(SimpleHTTPRequestHandler):
"/errpage/good.html": route_errpage_good,
"/errpage/missing.html": route_errpage_missing,
"/errpage/empty.html": route_errpage_empty,
"/mimex/index.html": route_mimex_index,
"/mimex/blob.pdf": route_mimex_blob,
"/mimex/real.html": route_mimex_real,
}
# --- dispatch ----------------------------------------------------------