Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
896a589f94 Add --pause to space out file downloads by a random delay (#185)
A new --pause MIN[:MAX] (seconds, -%G) waits a random MIN..MAX between
files so a crawl looks less like a bot and is gentler on the server; a
single value is a fixed delay. Disabled by default.

It reuses the existing non-blocking launch gate
(back_pluggable_sockets_strict): rather than Sleep() -- which would freeze
the single select() pump and stall the other in-flight transfers -- the
gate just withholds new launches until the delay elapses, one file per
gap. The per-gap target is derived from the last-request timestamp so it
stays stable across the many gate evaluations within a gap yet rerolls on
each launch; sampling rand() per evaluation would instead bias the
realized delay toward MIN.

Two int fields appended at the httrackp tail (ABI-stable, no soname bump).
Covered by a pure-function self-test (range + spread, with teeth against
the min-bias bug) and a local-server crawl that asserts the pause slows a
multi-file mirror.

Closes #185

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-27 23:55:35 +02:00
13 changed files with 180 additions and 1 deletions

View File

@@ -24,6 +24,7 @@ httrack \- offline browser : copy websites to a local directory
[ \fB\-EN, \-\-max\-time[=N]\fR ]
[ \fB\-AN, \-\-max\-rate[=N]\fR ]
[ \fB\-%cN, \-\-connection\-per\-second[=N]\fR ]
[ \fB\-%G, \-\-pause\fR ]
[ \fB\-GN, \-\-max\-pause[=N]\fR ]
[ \fB\-cN, \-\-sockets[=N]\fR ]
[ \fB\-TN, \-\-timeout[=N]\fR ]
@@ -155,6 +156,8 @@ maximum mirror time in seconds (60=1 minute, 3600=1 hour) (\-\-max\-time[=N])
maximum transfer rate in bytes/seconds (1000=1KB/s max) (\-\-max\-rate[=N])
.IP \-%cN
maximum number of connections/seconds (*%c10) (\-\-connection\-per\-second[=N])
.IP \-%G
random pause of MIN[:MAX] seconds between files (e.g. %G5:10) (\-\-pause <param>)
.IP \-GN
pause transfer if N bytes reached, and wait until lock file is deleted (\-\-max\-pause[=N])
.SS Flow control:

View File

@@ -114,6 +114,8 @@ const char *hts_optalias[][4] = {
"strip [host/pattern=]key1,key2,... from URLs"},
{"cookies-file", "-%K", "param1",
"load extra cookies from a Netscape cookies.txt"},
{"pause", "-%G", "param1",
"random pause of MIN[:MAX] seconds between files"},
{"generate-errors", "-o", "single", ""},
{"do-not-generate-errors", "-o0", "single", ""},
{"purge-old", "-X", "param", ""},

View File

@@ -35,6 +35,7 @@ Please visit our Website: http://www.httrack.com
#include <fcntl.h>
#include <ctype.h>
#include <stdint.h> /* uint64_t for the pause mixer (already a hard dep via md5.h) */
/* File defs */
#include "htscore.h"
@@ -3314,6 +3315,21 @@ HTS_INLINE int back_fillmax(struct_back * sback, httrackp * opt,
return -1; /* plus de place */
}
/* Seed-derived: stable within a gap, rerolls per launch; a per-call rand()
would bias the delay toward min_ms (see header). Jitter, not crypto. */
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms) {
uint64_t z = (uint64_t) seed;
if (max_ms <= min_ms)
return min_ms;
/* SplitMix64 finalizer: scrambles the low-entropy ms timestamp. */
z += 0x9E3779B97F4A7C15ULL;
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
z ^= z >> 31;
return min_ms + (int) (z % (uint64_t) (max_ms - min_ms + 1));
}
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
int n = opt->maxsoc - back_nsoc(sback);
@@ -3334,6 +3350,18 @@ int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt) {
}
}
// #185 randomized inter-file pause: non-blocking, one launch per gap
if (n > 0 && opt->pause_max_ms > 0 && HTS_STAT.last_connect > 0) {
TStamp opTime =
HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect;
TStamp lap = mtime_local() - opTime;
if (lap < hts_pause_target_ms(opTime, opt->pause_min_ms, opt->pause_max_ms))
n = 0;
else
n = 1;
}
return n;
}
@@ -3748,6 +3776,11 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
if (StringNotEmpty(from->cookies_file))
StringCopyS(to->cookies_file, from->cookies_file);
if (from->pause_max_ms > 0) {
to->pause_min_ms = from->pause_min_ms;
to->pause_max_ms = from->pause_max_ms;
}
if (from->retry > -1)
to->retry = from->retry;

View File

@@ -418,6 +418,10 @@ int back_pluggable_sockets(struct_back * sback, httrackp * opt);
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
/* Randomized inter-file pause target in [min_ms,max_ms] (#185), derived from a
timestamp seed so it is stable within one gap and rerolls per launch. */
int hts_pause_target_ms(TStamp seed, int min_ms, int max_ms);
/* Schedule more links from the heap into free slots. Returns the number queued,
or <=0 if none could be added (no free slot / paused / stopped). */
int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,

View File

@@ -1994,6 +1994,33 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
StringCopy(opt->cookies_file, argv[na]);
}
break;
case 'G': // pause: randomized inter-file delay MIN[:MAX] seconds
if ((na + 1 >= argc) || (argv[na + 1][0] == '-')) {
HTS_PANIC_PRINTF("Option pause needs a blank space and a "
"delay in seconds (MIN[:MAX])");
printf("Example: --pause 5:10\n");
htsmain_free();
return -1;
} else {
double pmin = 0, pmax = 0;
int nf;
na++;
nf = sscanf(argv[na], "%lf:%lf", &pmin, &pmax);
if (nf < 2)
pmax = pmin; /* a single value means a fixed delay */
/* positive-form bounds: NaN fails every comparison, so this
rejects it before the undefined (int)(NaN*1000) cast */
if (nf < 1 || !(pmin >= 0 && pmax >= pmin && pmax <= 86400)) {
HTS_PANIC_PRINTF("Invalid --pause range (expected "
"MIN[:MAX] seconds, 0<=MIN<=MAX<=86400)");
htsmain_free();
return -1;
}
opt->pause_min_ms = (int) (pmin * 1000.0);
opt->pause_max_ms = (int) (pmax * 1000.0);
}
break;
case 't': /* do not change type (ending) of filenames according to the MIME type */
opt->no_type_change = 1;
if (*(com+1)=='0') { opt->no_type_change = 0; com++; }

View File

@@ -521,6 +521,7 @@ void help(const char *app, int more) {
infomsg(" EN maximum mirror time in seconds (60=1 minute, 3600=1 hour)");
infomsg(" AN maximum transfer rate in bytes/seconds (1000=1KB/s max)");
infomsg(" %cN maximum number of connections/seconds (*%c10)");
infomsg(" %G random pause of MIN[:MAX] seconds between files (e.g. %G5:10)");
infomsg
(" GN pause transfer if N bytes reached, and wait until lock file is deleted");
infomsg("");

View File

@@ -6046,6 +6046,8 @@ HTSEXT_API httrackp *hts_create_opt(void) {
StringCopy(opt->footer, HTS_DEFAULT_FOOTER);
StringCopy(opt->strip_query, "");
StringCopy(opt->cookies_file, "");
opt->pause_min_ms = 0;
opt->pause_max_ms = 0;
opt->ftp_proxy = HTS_TRUE;
opt->convert_utf8 = HTS_TRUE;
StringCopy(opt->filelist, "");

View File

@@ -537,6 +537,8 @@ struct httrackp {
hts_boolean no_query_dedup; /**< with urlhack, keep query-argument order */
String cookies_file; /**< extra Netscape cookies.txt to preload
(--cookies-file) */
int pause_min_ms; /**< inter-file pause lower bound, ms (0=off, #185) */
int pause_max_ms; /**< inter-file pause upper bound, ms */
};
/* Running statistics for a mirror. */

View File

@@ -912,12 +912,58 @@ static int st_copyopt(httrackp *opt, int argc, char **argv) {
if (strcmp(StringBuff(to->cookies_file), "/tmp/jar.txt") != 0)
err = 1;
/* #185 pause pair: copied when enabled (max>0), the 0 sentinel skips */
from->pause_min_ms = 5000;
from->pause_max_ms = 10000;
to->pause_min_ms = to->pause_max_ms = 0;
copy_htsopt(from, to);
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
err = 1;
from->pause_min_ms = from->pause_max_ms = 0;
copy_htsopt(from, to);
if (to->pause_min_ms != 5000 || to->pause_max_ms != 10000)
err = 1;
hts_free_opt(from);
hts_free_opt(to);
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
return err;
}
static int st_pause(httrackp *opt, int argc, char **argv) {
int err = 0, i, seen_low = 0, seen_high = 0;
(void) opt;
(void) argc;
(void) argv;
/* Consecutive-ms seeds (production shape: launch timestamps a few ms apart)
must stay in range and spread, not collapse to a bound -- worst case for a
weak low-bit mixer. */
for (i = 0; i < 10000; i++) {
int t = hts_pause_target_ms((TStamp) (1719500000000LL + i), 5000, 10000);
if (t < 5000 || t > 10000)
err = 1;
seen_low |= (t < 6000);
seen_high |= (t > 9000);
}
if (!seen_low || !seen_high)
err = 1;
if (hts_pause_target_ms(12345, 8000, 8000) != 8000) /* equal bounds = fixed */
err = 1;
/* deterministic: a seed yields the same target even after an intervening call
with another seed (no global PRNG state to perturb it) */
{
int a = hts_pause_target_ms(99, 5000, 10000);
(void) hts_pause_target_ms(54321, 5000, 10000);
if (hts_pause_target_ms(99, 5000, 10000) != a)
err = 1;
}
printf("pause: %s\n", err ? "FAIL" : "OK");
return err;
}
static int st_relative(httrackp *opt, int argc, char **argv) {
char s[HTS_URLMAXSIZE * 2];
@@ -1264,6 +1310,7 @@ static const struct selftest_entry {
{"strsafe", "[overflow|overflow-buff [str]]", "bounded string-op self-test",
st_strsafe},
{"copyopt", "", "copy_htsopt option-copy self-test", st_copyopt},
{"pause", "", "randomized inter-file pause target self-test", st_pause},
{"relative", "<link> <curr-file>", "relative link between two paths",
st_relative},
{"resolve", "<link> <adr> <fil>", "resolve a link against an origin",

View File

@@ -90,4 +90,16 @@ refused "dangling-quote argument not refused cleanly"
run_only "$tmp/q-lone" '"'
refused "lone-quote argument not refused cleanly"
# --pause (#185): valid MIN[:MAX] accepted; malformed, reversed, over-range and
# non-finite values refused cleanly. NaN defeats naive `<`/`>` checks (it
# compares false to everything), so it must not slip through to the int cast.
run "$tmp/pause-ok" --pause 0.2:0.4
accepted "$tmp/pause-ok" "#185: valid --pause range rejected"
run "$tmp/pause-fix" --pause 0.2
accepted "$tmp/pause-fix" "#185: valid fixed --pause rejected"
for bad in nan nan:5 5:nan inf 10:5 99999; do
run "$tmp/pause-bad" --pause "$bad"
refused "#185: invalid --pause '$bad' not refused cleanly"
done
exit 0

15
tests/01_engine-pause.test Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
#
# --pause (#185): the inter-file pause target must stay in [min,max] and spread
# across it (a per-call rand() would collapse it toward min). Driven by the
# in-process 'httrack -#test=pause' test. POSIX-portable ($(BASH) is /bin/sh on macOS).
set -eu
# 'run' is an ignored placeholder argument.
out=$(httrack -#test=pause run)
test "$out" = "pause: OK" || {
echo "expected 'pause: OK', got: $out" >&2
exit 1
}

29
tests/28_local-pause.test Executable file
View File

@@ -0,0 +1,29 @@
#!/bin/bash
#
# --pause (#185): a fixed inter-file delay must slow a multi-file crawl. Measure
# the same crawl with and without --pause and compare: the harness overhead
# cancels, leaving only the pause. Integer seconds keep it portable (BSD date
# has no %N); a lower bound is not timing-flaky since a pause only adds time.
set -e
: "${top_srcdir:=..}"
run() { # echoes the wall-clock seconds of one crawl
local t0 t1
t0=$(date +%s)
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
httrack 'BASEURL/types/index.html' -c1 "$@" >/dev/null 2>&1
t1=$(date +%s)
echo $((t1 - t0))
}
base=$(run)
paused=$(run --pause 0.5)
delta=$((paused - base))
echo "crawl: ${base}s, with --pause 0.5: ${paused}s (delta ${delta}s)"
if [ "$delta" -lt 2 ]; then
echo "FAIL: --pause did not delay the crawl (delta ${delta}s)" >&2
exit 1
fi

View File

@@ -41,6 +41,7 @@ TESTS = \
01_engine-idna.test \
01_engine-mime.test \
01_engine-parse.test \
01_engine-pause.test \
01_engine-rcfile.test \
01_engine-relative.test \
01_engine-savename.test \
@@ -73,6 +74,7 @@ TESTS = \
24_local-resume-overlap.test \
25_local-mime-exclude.test \
26_local-strip-query.test \
27_local-cookies-file.test
27_local-cookies-file.test \
28_local-pause.test
CLEANFILES = check-network_sh.cache