mirror of
https://github.com/xroche/httrack.git
synced 2026-06-18 00:04:12 +03:00
Compare commits
9 Commits
cleanup/cm
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fa57f0148f | ||
|
|
76260d5e6e | ||
|
|
5d0913dfce | ||
|
|
9b7601a987 | ||
|
|
4ec38c4e66 | ||
|
|
1142b64696 | ||
|
|
22d3eb44cd | ||
|
|
8246c7bbcd | ||
|
|
f13f90e9c4 |
@@ -30,6 +30,12 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htsarrays.h
|
||||
* Header-only generic dynamic array (a typed growable vector). All operations
|
||||
* are macros parameterized by the array lvalue A; the element type T is fixed
|
||||
* by the struct that TypedArray(T) declares. Counts and capacities are in
|
||||
* elements, not bytes. The array owns its backing store: grow it via the Add/
|
||||
* Append/EnsureRoom macros and release it with TypedArrayFree. */
|
||||
#ifndef HTS_ARRAYS_DEFSTATIC
|
||||
#define HTS_ARRAYS_DEFSTATIC
|
||||
|
||||
@@ -39,7 +45,8 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#include "htssafe.h"
|
||||
|
||||
/* Memory allocation assertion failure */
|
||||
/* Abort (with the failed byte count) when a growth allocation fails. The
|
||||
array macros never return an out-of-memory error; they assert and abort. */
|
||||
static void hts_record_assert_memory_failed(const size_t size) {
|
||||
fprintf(stderr, "memory allocation failed (%lu bytes)", \
|
||||
(long int) size); \
|
||||
@@ -61,6 +68,8 @@ static void hts_record_assert_memory_failed(const size_t size) {
|
||||
/** Capacity. **/ \
|
||||
size_t capa; \
|
||||
}
|
||||
|
||||
/** Initializer for an empty array (no backing store, size and capacity 0). **/
|
||||
#define EMPTY_TYPED_ARRAY { { NULL }, 0, 0 }
|
||||
|
||||
/** Array size, in elements. **/
|
||||
@@ -84,7 +93,8 @@ static void hts_record_assert_memory_failed(const size_t size) {
|
||||
/** Size of T. **/
|
||||
#define TypedArrayWidth(A) (sizeof(*TypedArrayElts(A)))
|
||||
|
||||
/** Nth element of the array. **/
|
||||
/** Nth element of the array, as an lvalue. No bounds check; N must be
|
||||
< TypedArraySize(A). **/
|
||||
#define TypedArrayNth(A, N) (TypedArrayElts(A)[N])
|
||||
|
||||
/**
|
||||
|
||||
@@ -63,12 +63,15 @@ extern "C" {
|
||||
#endif
|
||||
#include <assert.h>
|
||||
|
||||
/* GCC extension */
|
||||
/* Compiler-portability attribute macros (no-ops on non-GCC). */
|
||||
#ifndef HTS_UNUSED
|
||||
#ifdef __GNUC__
|
||||
#define HTS_UNUSED __attribute__ ((unused))
|
||||
|
||||
#define HTS_STATIC static __attribute__ ((unused))
|
||||
|
||||
#define HTS_INLINE __inline__
|
||||
/* printf-style format check; fmt/arg are 1-based argument positions. */
|
||||
#define HTS_PRINTF_FUN(fmt, arg) __attribute__ ((format (printf, fmt, arg)))
|
||||
#else
|
||||
#define HTS_UNUSED
|
||||
@@ -78,29 +81,37 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* min/max evaluate their arguments twice; pass side-effect-free expressions. */
|
||||
#undef min
|
||||
#undef max
|
||||
#define min(a,b) ((a)>(b)?(b):(a))
|
||||
|
||||
#define max(a,b) ((a)>(b)?(a):(b))
|
||||
|
||||
#ifndef _WIN32
|
||||
#undef Sleep
|
||||
#define min(a,b) ((a)>(b)?(b):(a))
|
||||
|
||||
#define max(a,b) ((a)>(b)?(a):(b))
|
||||
|
||||
/* Win32 Sleep() shim for POSIX; argument is milliseconds. */
|
||||
#define Sleep(a) { if (((a)*1000)%1000000) usleep(((a)*1000)%1000000); if (((a)*1000)/1000000) sleep(((a)*1000)/1000000); }
|
||||
#endif
|
||||
|
||||
// teste égalité de 2 chars, case insensitive
|
||||
/* hichar: ASCII uppercasing of one char. streql: case-insensitive equality of
|
||||
two chars. ASCII only; not locale-aware. */
|
||||
#define hichar(a) ((((a)>='a') && ((a)<='z')) ? ((a)-('a'-'A')) : (a))
|
||||
|
||||
#define streql(a,b) (hichar(a)==hichar(b))
|
||||
|
||||
// caractère maj
|
||||
/* True if a is an ASCII uppercase letter. */
|
||||
#define isUpperLetter(a) ( ((a) >= 'A') && ((a) <= 'Z') )
|
||||
|
||||
/* Library internal definitions */
|
||||
/* Library-internal only (engine translation units that define
|
||||
HTS_INTERNAL_BYTECODE); not part of the consumer surface. */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
// functions
|
||||
/* Resolve a symbol in an already-loaded dynamic module. */
|
||||
#ifdef _WIN32
|
||||
#define DynamicGet(handle, sym) GetProcAddress(handle, sym)
|
||||
#else
|
||||
|
||||
@@ -31,6 +31,11 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htsbasenet.h
|
||||
Base networking definitions: platform socket headers, the optional global
|
||||
OpenSSL context, and the status-code/connection-state enumerations stored in
|
||||
htsblk and lien_back. Pulled in by htsnet.h. */
|
||||
|
||||
#ifndef HTS_DEFBASENETH
|
||||
#define HTS_DEFBASENETH
|
||||
|
||||
@@ -80,7 +85,8 @@ extern "C" {
|
||||
/* OpenSSL structure */
|
||||
#include <openssl/bio.h>
|
||||
|
||||
/* Global SSL context */
|
||||
/** Process-wide OpenSSL client context, created lazily on first TLS use;
|
||||
shared by all connections. NULL until initialized. */
|
||||
extern SSL_CTX *openssl_ctx;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -31,51 +31,77 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htsbauth.h
|
||||
HTTP Basic authentication storage: a per-session list of (URL-prefix,
|
||||
credentials) pairs, plus the cookie jar that holds it. */
|
||||
|
||||
#ifndef HTSBAUTH_DEFH
|
||||
#define HTSBAUTH_DEFH
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
// robots wizard
|
||||
/** One stored credential: the longest-prefix match against a request's
|
||||
host+path selects which auth header to send. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_bauth_chain
|
||||
#define HTS_DEF_FWSTRUCT_bauth_chain
|
||||
typedef struct bauth_chain bauth_chain;
|
||||
#endif
|
||||
struct bauth_chain {
|
||||
char prefix[1024]; /* www.foo.com/secure/ */
|
||||
char auth[1024]; /* base-64 encoded user:pass */
|
||||
struct bauth_chain *next; /* next element */
|
||||
char prefix[1024]; /* host + path prefix, e.g. www.foo.com/secure/ */
|
||||
char auth[1024]; /* base-64 encoded user:pass (Authorization payload) */
|
||||
struct bauth_chain *next; /* next element, NULL-terminated list */
|
||||
};
|
||||
|
||||
// buffer pour les cookies et authentification
|
||||
/** Per-session cookie jar; also holds the basic-auth list head (auth).
|
||||
The head node (auth) is embedded, not heap-allocated. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_t_cookie
|
||||
#define HTS_DEF_FWSTRUCT_t_cookie
|
||||
typedef struct t_cookie t_cookie;
|
||||
#endif
|
||||
struct t_cookie {
|
||||
int max_len;
|
||||
char data[32768];
|
||||
bauth_chain auth;
|
||||
int max_len; /* capacity of data[] in use */
|
||||
char data[32768]; /* raw cookie store (NUL-terminated field list) */
|
||||
bauth_chain auth; /* embedded head of the basic-auth list */
|
||||
};
|
||||
|
||||
/* Library internal definitions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
// cookies
|
||||
/* cookies */
|
||||
int cookie_add(t_cookie * cookie, const char *cook_name, const char *cook_value,
|
||||
const char *domain, const char *path);
|
||||
|
||||
int cookie_del(t_cookie * cookie, const char *cook_name, const char *domain, const char *path);
|
||||
|
||||
int cookie_load(t_cookie * cookie, const char *path, const char *name);
|
||||
|
||||
int cookie_save(t_cookie * cookie, const char *name);
|
||||
|
||||
void cookie_insert(char *s, size_t s_size, const char *ins);
|
||||
|
||||
void cookie_delete(char *s, size_t s_size, size_t pos);
|
||||
|
||||
const char *cookie_get(char *buffer, const char *cookie_base, int param);
|
||||
|
||||
char *cookie_find(char *s, const char *cook_name, const char *domain, const char *path);
|
||||
|
||||
char *cookie_nextfield(char *a);
|
||||
|
||||
// basic auth
|
||||
/* basic auth */
|
||||
|
||||
/** Register credentials (auth = base-64 user:pass) for the prefix derived from
|
||||
adr (host) and fil (path). No-op returning 0 if cookie is NULL, allocation
|
||||
fails, or a matching prefix is already stored; returns 1 on insertion. */
|
||||
int bauth_add(t_cookie * cookie, const char *adr, const char *fil, const char *auth);
|
||||
|
||||
/** Return the stored base-64 credentials whose prefix matches adr+fil, or NULL
|
||||
if none (or cookie is NULL). Returned pointer aliases the jar's bauth_chain;
|
||||
caller must not free it. */
|
||||
char *bauth_check(t_cookie * cookie, const char *adr, const char *fil);
|
||||
|
||||
/** Build the auth lookup key (host + path, query string stripped, truncated at
|
||||
the last '/') from adr and fil into buffer; returns buffer. Caller must
|
||||
supply a buffer of HTS_URLMAXSIZE * 2 bytes. */
|
||||
char *bauth_prefix(char *buffer, const char *adr, const char *fil);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,6 +52,9 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#define SELFTEST_VOLUME 3000 /* number of small entries in the scale pass */
|
||||
|
||||
/* prefix on assertion failures; set per entry point (-#A vs -#B) */
|
||||
static const char *selftest_tag = "cache-selftest";
|
||||
|
||||
/* Open a cache session. A write session (ro=0) rotates new.zip -> old.zip and
|
||||
opens a fresh new.zip; a read session (ro=1) opens new.zip in place. */
|
||||
static void selftest_open(cache_back *cache, httrackp *opt, int ro) {
|
||||
@@ -95,15 +98,13 @@ static void selftest_close(cache_back *cache) {
|
||||
that exits right after (same choice as the other -# cache subcommands) */
|
||||
}
|
||||
|
||||
/* Store one entry. The body is copied into a private buffer (any size), so
|
||||
callers may pass const data and cache_add never sees a cast-away qualifier;
|
||||
it consumes everything synchronously, so the copy is freed on return. */
|
||||
/* Store one entry; the body is copied so callers may pass const data. */
|
||||
static void store_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
const char *fil, const char *save, int statuscode,
|
||||
const char *msg, const char *contenttype,
|
||||
const char *charset, const char *lastmodified,
|
||||
const char *etag, const char *location,
|
||||
const char *body, size_t body_len) {
|
||||
const char *cdispo, const char *body, size_t body_len) {
|
||||
htsblk r;
|
||||
char locbuf[HTS_URLMAXSIZE * 2];
|
||||
char *bodycopy = NULL;
|
||||
@@ -116,32 +117,30 @@ static void store_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
strcpybuff(r.charset, charset);
|
||||
strcpybuff(r.lastmodified, lastmodified);
|
||||
strcpybuff(r.etag, etag);
|
||||
strcpybuff(r.cdispo, cdispo);
|
||||
strcpybuff(locbuf, location);
|
||||
r.location = locbuf;
|
||||
r.is_write = 0;
|
||||
/* an empty body must be a NULL pointer: cache_add rejects a non-NULL
|
||||
pointer with size 0 */
|
||||
/* an empty body must be NULL: cache_add rejects non-NULL with size 0 */
|
||||
if (body_len != 0) {
|
||||
bodycopy = malloct(body_len);
|
||||
memcpy(bodycopy, body, body_len);
|
||||
r.adr = bodycopy;
|
||||
}
|
||||
/* all_in_cache=1: keep the body in the ZIP whatever the content-type,
|
||||
so the read path never depends on a file on disk */
|
||||
/* all_in_cache=1: body stays in the ZIP, so reads never need a disk file */
|
||||
cache_add(opt, cache, &r, adr, fil, save, 1, NULL);
|
||||
if (bodycopy != NULL) {
|
||||
freet(bodycopy);
|
||||
}
|
||||
}
|
||||
|
||||
/* Read one entry back and check every field. Returns the number of
|
||||
mismatches (0 == success). */
|
||||
/* Read one entry back and check every field. Returns the mismatch count. */
|
||||
static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
const char *fil, int statuscode, const char *msg,
|
||||
const char *contenttype, const char *charset,
|
||||
const char *lastmodified, const char *etag,
|
||||
const char *location, const char *body,
|
||||
size_t body_len) {
|
||||
const char *location, const char *cdispo,
|
||||
const char *body, size_t body_len) {
|
||||
int fail = 0;
|
||||
char *locbuf = malloct(HTS_URLMAXSIZE * 2);
|
||||
htsblk r;
|
||||
@@ -153,15 +152,14 @@ static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
#define CHECK_STR(field, want) \
|
||||
do { \
|
||||
if (strcmp((field), (want)) != 0) { \
|
||||
fprintf(stderr, \
|
||||
"cache-selftest: %s%s: " #field " is '%s', expected '%s'\n", \
|
||||
adr, fil, (field), (want)); \
|
||||
fprintf(stderr, "%s: %s%s: " #field " is '%s', expected '%s'\n", \
|
||||
selftest_tag, adr, fil, (field), (want)); \
|
||||
fail++; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
if (r.statuscode != statuscode) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: statuscode is %d, expected %d\n",
|
||||
fprintf(stderr, "%s: %s%s: statuscode is %d, expected %d\n", selftest_tag,
|
||||
adr, fil, r.statuscode, statuscode);
|
||||
fail++;
|
||||
}
|
||||
@@ -171,24 +169,23 @@ static int check_entry(httrackp *opt, cache_back *cache, const char *adr,
|
||||
CHECK_STR(r.lastmodified, lastmodified);
|
||||
CHECK_STR(r.etag, etag);
|
||||
CHECK_STR(locbuf, location);
|
||||
CHECK_STR(r.cdispo, cdispo);
|
||||
|
||||
if (r.size != (LLint) body_len) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: size is " LLintP ", expected %d\n",
|
||||
fprintf(stderr, "%s: %s%s: size is " LLintP ", expected %d\n", selftest_tag,
|
||||
adr, fil, (LLint) r.size, (int) body_len);
|
||||
fail++;
|
||||
} else if (body_len != 0 &&
|
||||
(r.adr == NULL || memcmp(r.adr, body, body_len) != 0)) {
|
||||
fprintf(stderr, "cache-selftest: %s%s: body mismatch\n", adr, fil);
|
||||
fprintf(stderr, "%s: %s%s: body mismatch\n", selftest_tag, adr, fil);
|
||||
fail++;
|
||||
}
|
||||
|
||||
/* The loaded body must be NUL-terminated at [size]: cache_readex's strlen()
|
||||
consumers (htscore.c:1046, htscache.c) rely on it, and a missing
|
||||
terminator is a heap over-read. The buffer is malloc(size + slack), so
|
||||
reading [size] is in bounds. */
|
||||
/* loaded body must be NUL-terminated at [size] for cache_readex's strlen()
|
||||
consumers; buffer is malloc(size + slack) so [size] is in bounds */
|
||||
if (r.adr != NULL && r.adr[r.size] != '\0') {
|
||||
fprintf(stderr, "cache-selftest: %s%s: body not NUL-terminated at [size]\n",
|
||||
adr, fil);
|
||||
fprintf(stderr, "%s: %s%s: body not NUL-terminated at [size]\n",
|
||||
selftest_tag, adr, fil);
|
||||
fail++;
|
||||
}
|
||||
|
||||
@@ -378,27 +375,29 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
/* pass 1: create everything in a single write session */
|
||||
selftest_open_for_write(&cache, opt);
|
||||
|
||||
/* edge cases: normal HTML page */
|
||||
/* edge cases (cdispo "" except where noted): normal HTML page */
|
||||
store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
|
||||
"OK", "text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
|
||||
"etag-normal", "", body_index, strlen(body_index));
|
||||
"etag-normal", "", "", body_index, strlen(body_index));
|
||||
/* redirect: empty body, empty optional fields, near-limit location */
|
||||
store_entry(opt, &cache, "example.com", "/moved", "example.com/moved.html",
|
||||
301, "Moved Permanently", "text/html", "", "", "", location_long,
|
||||
NULL, 0);
|
||||
/* non-HTML content-type kept in cache via all_in_cache, near-limit etag */
|
||||
"", NULL, 0);
|
||||
/* non-HTML content-type, near-limit etag */
|
||||
store_entry(opt, &cache, "example.com", "/api", "example.com/api.json", 200,
|
||||
"OK", "application/json", "utf-8",
|
||||
"Tue, 02 Jan 2024 12:00:00 GMT", etag_long, "", body_api,
|
||||
"Tue, 02 Jan 2024 12:00:00 GMT", etag_long, "", "", body_api,
|
||||
strlen(body_api));
|
||||
/* binary body */
|
||||
/* binary body, with a Content-Disposition */
|
||||
store_entry(opt, &cache, "example.com", "/logo", "example.com/logo.png", 200,
|
||||
"OK", "image/png", "", "", "etag-bin", "", binary_body,
|
||||
"OK", "image/png", "", "", "etag-bin", "",
|
||||
"attachment; filename=\"logo.png\"", binary_body,
|
||||
sizeof(binary_body));
|
||||
/* error status with a body and a location (non-2xx codes are cached too) */
|
||||
/* error status with a body and a location */
|
||||
store_entry(opt, &cache, "example.com", "/gone", "example.com/gone.html", 404,
|
||||
"Not Found", "text/html", "utf-8", "", "etag-404",
|
||||
"https://example.com/where-it-went", body_404, strlen(body_404));
|
||||
"https://example.com/where-it-went", "", body_404,
|
||||
strlen(body_404));
|
||||
|
||||
/* scale: a few thousand small entries */
|
||||
for (i = 0; i < SELFTEST_VOLUME; i++) {
|
||||
@@ -408,7 +407,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
sprintf(save, "example.com/v/%05d.html", i);
|
||||
sprintf(body, "<html>volume entry %d</html>", i);
|
||||
store_entry(opt, &cache, "example.com", fil, save, 200, "OK", "text/html",
|
||||
"utf-8", "", "", "", body, strlen(body));
|
||||
"utf-8", "", "", "", "", body, strlen(body));
|
||||
}
|
||||
|
||||
/* compression: a few large bodies */
|
||||
@@ -418,7 +417,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
sprintf(fil, "/big/%d.bin", i);
|
||||
sprintf(save, "example.com/big/%d.bin", i);
|
||||
store_entry(opt, &cache, "example.com", fil, save, 200, "OK",
|
||||
"application/octet-stream", "", "", "", "", large_body[i],
|
||||
"application/octet-stream", "", "", "", "", "", large_body[i],
|
||||
large_size[i]);
|
||||
}
|
||||
|
||||
@@ -427,22 +426,24 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
/* pass 2: read back and verify everything round-tripped */
|
||||
selftest_open_for_read(&cache, opt);
|
||||
|
||||
failures += check_entry(opt, &cache, "example.com", "/", 200, "OK",
|
||||
"text/html", "utf-8", "Mon, 01 Jan 2024 00:00:00 GMT",
|
||||
"etag-normal", "", body_index, strlen(body_index));
|
||||
failures +=
|
||||
check_entry(opt, &cache, "example.com", "/", 200, "OK", "text/html",
|
||||
"utf-8", "Mon, 01 Jan 2024 00:00:00 GMT", "etag-normal", "",
|
||||
"", body_index, strlen(body_index));
|
||||
failures += check_entry(opt, &cache, "example.com", "/moved", 301,
|
||||
"Moved Permanently", "text/html", "", "", "",
|
||||
location_long, NULL, 0);
|
||||
location_long, "", NULL, 0);
|
||||
failures +=
|
||||
check_entry(opt, &cache, "example.com", "/api", 200, "OK",
|
||||
"application/json", "utf-8", "Tue, 02 Jan 2024 12:00:00 GMT",
|
||||
etag_long, "", body_api, strlen(body_api));
|
||||
etag_long, "", "", body_api, strlen(body_api));
|
||||
failures +=
|
||||
check_entry(opt, &cache, "example.com", "/logo", 200, "OK", "image/png",
|
||||
"", "", "etag-bin", "", binary_body, sizeof(binary_body));
|
||||
"", "", "etag-bin", "", "attachment; filename=\"logo.png\"",
|
||||
binary_body, sizeof(binary_body));
|
||||
failures += check_entry(opt, &cache, "example.com", "/gone", 404, "Not Found",
|
||||
"text/html", "utf-8", "", "etag-404",
|
||||
"https://example.com/where-it-went", body_404,
|
||||
"https://example.com/where-it-went", "", body_404,
|
||||
strlen(body_404));
|
||||
|
||||
for (i = 0; i < SELFTEST_VOLUME; i++) {
|
||||
@@ -452,7 +453,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
sprintf(body, "<html>volume entry %d</html>", i);
|
||||
failures +=
|
||||
check_entry(opt, &cache, "example.com", fil, 200, "OK", "text/html",
|
||||
"utf-8", "", "", "", body, strlen(body));
|
||||
"utf-8", "", "", "", "", body, strlen(body));
|
||||
}
|
||||
|
||||
for (i = 0; i < large_count; i++) {
|
||||
@@ -460,7 +461,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
|
||||
sprintf(fil, "/big/%d.bin", i);
|
||||
failures += check_entry(opt, &cache, "example.com", fil, 200, "OK",
|
||||
"application/octet-stream", "", "", "", "",
|
||||
"application/octet-stream", "", "", "", "", "",
|
||||
large_body[i], large_size[i]);
|
||||
}
|
||||
|
||||
@@ -470,7 +471,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
selftest_open_for_write(&cache, opt);
|
||||
store_entry(opt, &cache, "example.com", "/", "example.com/index.html", 200,
|
||||
"OK", "text/html", "iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT",
|
||||
"etag-updated", "", body_updated, strlen(body_updated));
|
||||
"etag-updated", "", "", body_updated, strlen(body_updated));
|
||||
selftest_close(&cache);
|
||||
|
||||
/* pass 4: re-read and confirm the updated value, not the old one */
|
||||
@@ -478,7 +479,7 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
failures +=
|
||||
check_entry(opt, &cache, "example.com", "/", 200, "OK", "text/html",
|
||||
"iso-8859-1", "Wed, 03 Jan 2024 09:30:00 GMT", "etag-updated",
|
||||
"", body_updated, strlen(body_updated));
|
||||
"", "", body_updated, strlen(body_updated));
|
||||
selftest_close(&cache);
|
||||
|
||||
/* pass 5: the disk-fallback read path (X-In-Cache: 0, body on disk) */
|
||||
@@ -490,3 +491,97 @@ int cache_selftests(httrackp *opt, const char *dir) {
|
||||
|
||||
return failures;
|
||||
}
|
||||
|
||||
/* Golden fixture: a small frozen cache read back to guard the read path and ZIP
|
||||
format. The table is the contract; tests/fixtures/cache-golden/.../new.zip is
|
||||
a witness written once via `httrack -#B <dir> regen`. Bodies stay in the ZIP
|
||||
(all_in_cache=1), so a read needs only new.zip -- fully portable. */
|
||||
|
||||
/* embedded NUL + high bytes: the binary-safe read path */
|
||||
static const char golden_binary[] = {
|
||||
'P', 'N', 'G', '\0', '\r', '\n', (char) 0xFF, (char) 0x80,
|
||||
'\0', '\0', 'e', 'n', 'd', (char) 0xCA, (char) 0xFE, '\n'};
|
||||
|
||||
typedef struct {
|
||||
const char *adr, *fil, *save, *msg, *contenttype, *charset, *lastmodified,
|
||||
*etag, *location, *cdispo, *body;
|
||||
size_t body_len;
|
||||
int statuscode;
|
||||
} golden_entry;
|
||||
|
||||
/* string-literal body + length (drops the terminator); the binary array passes
|
||||
its length explicitly, every byte counts */
|
||||
#define GBODY(s) (s), (sizeof(s) - 1)
|
||||
|
||||
static const golden_entry golden_entries[] = {
|
||||
/* normal HTML page */
|
||||
{"example.com", "/", "example.com/index.html", "OK", "text/html", "utf-8",
|
||||
"Mon, 01 Jan 2024 00:00:00 GMT", "etag-normal", "", "",
|
||||
GBODY("<html><body>hello</body></html>"), 200},
|
||||
/* redirect: empty body and optionals, a Location */
|
||||
{"example.com", "/moved", "example.com/moved.html", "Moved Permanently",
|
||||
"text/html", "", "", "", "https://example.com/new-home", "", NULL, 0, 301},
|
||||
/* non-HTML content */
|
||||
{"example.com", "/api", "example.com/api.json", "OK", "application/json",
|
||||
"utf-8", "Tue, 02 Jan 2024 12:00:00 GMT", "etag-api", "", "",
|
||||
GBODY("{\"k\":\"v\"}"), 200},
|
||||
/* binary body with a Content-Disposition */
|
||||
{"example.com", "/logo", "example.com/logo.png", "OK", "image/png", "", "",
|
||||
"etag-bin", "", "attachment; filename=\"logo.png\"", golden_binary,
|
||||
sizeof(golden_binary), 200},
|
||||
/* error status with a body and a Location */
|
||||
{"example.com", "/gone", "example.com/gone.html", "Not Found", "text/html",
|
||||
"utf-8", "", "etag-404", "https://example.com/where-it-went", "",
|
||||
GBODY("<html><body>404 Not Found</body></html>"), 404},
|
||||
};
|
||||
|
||||
#define GOLDEN_COUNT (sizeof(golden_entries) / sizeof(golden_entries[0]))
|
||||
|
||||
static void golden_setup(httrackp *opt, const char *dir) {
|
||||
char base[HTS_URLMAXSIZE];
|
||||
|
||||
strcpybuff(base, dir);
|
||||
if (base[0] != '\0' && base[strlen(base) - 1] != '/') {
|
||||
strcatbuff(base, "/");
|
||||
}
|
||||
StringCopy(opt->path_log, base);
|
||||
StringCopy(opt->path_html, base);
|
||||
StringCopy(opt->path_html_utf8, base);
|
||||
opt->cache = 1;
|
||||
}
|
||||
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen) {
|
||||
int failures = 0;
|
||||
size_t k;
|
||||
cache_back cache;
|
||||
|
||||
selftest_tag = "cache-golden";
|
||||
golden_setup(opt, dir);
|
||||
|
||||
/* regen rewrites the fixture from the table; the test never passes it, so the
|
||||
read pass verifies bytes a previous build froze */
|
||||
if (regen) {
|
||||
selftest_open_for_write(&cache, opt);
|
||||
for (k = 0; k < GOLDEN_COUNT; k++) {
|
||||
const golden_entry *e = &golden_entries[k];
|
||||
|
||||
store_entry(opt, &cache, e->adr, e->fil, e->save, e->statuscode, e->msg,
|
||||
e->contenttype, e->charset, e->lastmodified, e->etag,
|
||||
e->location, e->cdispo, e->body, e->body_len);
|
||||
}
|
||||
selftest_close(&cache);
|
||||
}
|
||||
|
||||
selftest_open_for_read(&cache, opt);
|
||||
for (k = 0; k < GOLDEN_COUNT; k++) {
|
||||
const golden_entry *e = &golden_entries[k];
|
||||
|
||||
failures +=
|
||||
check_entry(opt, &cache, e->adr, e->fil, e->statuscode, e->msg,
|
||||
e->contenttype, e->charset, e->lastmodified, e->etag,
|
||||
e->location, e->cdispo, e->body, e->body_len);
|
||||
}
|
||||
selftest_close(&cache);
|
||||
|
||||
return failures;
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ typedef struct httrackp httrackp;
|
||||
Returns the number of failed checks (0 == success). */
|
||||
int cache_selftests(httrackp *opt, const char *dir);
|
||||
|
||||
/* Read a committed (frozen) cache fixture under <dir>/hts-cache/new.zip and
|
||||
assert a fixed set of entries decodes field- and byte-exact. Unlike
|
||||
cache_selftests (write-then-read with the same build, a round-trip), this
|
||||
reads bytes an earlier build froze, so it catches read-path / format drift.
|
||||
regen!=0 first rewrites the fixture from the same table (to regenerate the
|
||||
committed file; the test itself never passes it). Returns the failed-check count. */
|
||||
int cache_golden_selftest(httrackp *opt, const char *dir, int regen);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
202
src/htscore.h
202
src/htscore.h
@@ -30,7 +30,9 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
// Fichier librairie .h
|
||||
/* Core engine declarations. Not an installed header, but part of the de-facto
|
||||
API surface: external consumers (e.g. httrack-android) read these structs and
|
||||
constants and call functions declared here. */
|
||||
#ifndef HTS_CORE_DEFH
|
||||
#define HTS_CORE_DEFH
|
||||
|
||||
@@ -38,7 +40,7 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
/* specific definitions */
|
||||
#include "htsbase.h"
|
||||
// Includes & définitions
|
||||
/* Includes and definitions */
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#ifdef _WIN32
|
||||
@@ -83,45 +85,45 @@ typedef struct filecreate_params filecreate_params;
|
||||
// options
|
||||
#include "htsopt.h"
|
||||
|
||||
// INCLUDES .H PARTIES DE CODE HTTRACK
|
||||
// HTTrack engine sub-headers
|
||||
|
||||
// routine main
|
||||
// main entry point
|
||||
#include "htscoremain.h"
|
||||
|
||||
// core routines
|
||||
#include "htscore.h"
|
||||
|
||||
// divers outils pour httrack.c
|
||||
// misc tools for httrack.c
|
||||
#include "htstools.h"
|
||||
|
||||
// aide pour la version en ligne de commande
|
||||
// command-line help
|
||||
#include "htshelp.h"
|
||||
|
||||
// génération du nom de fichier à sauver
|
||||
// build the on-disk save filename
|
||||
#include "htsname.h"
|
||||
|
||||
// gestion ftp
|
||||
// FTP support
|
||||
#include "htsftp.h"
|
||||
|
||||
// gestion interception d'URL
|
||||
// URL interception
|
||||
#include "htscatchurl.h"
|
||||
|
||||
// gestion robots.txt
|
||||
// robots.txt handling
|
||||
#include "htsrobots.h"
|
||||
|
||||
// routines d'acceptation de liens
|
||||
// link-acceptance rules
|
||||
#include "htswizard.h"
|
||||
|
||||
// routines de regexp
|
||||
// regexp/filter routines
|
||||
#include "htsfilters.h"
|
||||
|
||||
// gestion backing
|
||||
// download backing (the back[] slot ring)
|
||||
#include "htsback.h"
|
||||
|
||||
// gestion cache
|
||||
// cache handling
|
||||
#include "htscache.h"
|
||||
|
||||
// gestion hashage
|
||||
// hashing
|
||||
#include "htshash.h"
|
||||
#include "coucal.h"
|
||||
|
||||
@@ -129,65 +131,74 @@ typedef struct filecreate_params filecreate_params;
|
||||
|
||||
#include "hts-indextmpl.h"
|
||||
|
||||
// adr, fil
|
||||
/** A remote URL split into host and path, each a fixed inline buffer
|
||||
(HTS_URLMAXSIZE*2 bytes, NUL-terminated). */
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_adrfil
|
||||
#define HTS_DEF_FWSTRUCT_lien_adrfil
|
||||
typedef struct lien_adrfil lien_adrfil;
|
||||
#endif
|
||||
struct lien_adrfil {
|
||||
char adr[HTS_URLMAXSIZE * 2]; // adresse
|
||||
char fil[HTS_URLMAXSIZE * 2]; // nom du fichier distant
|
||||
char adr[HTS_URLMAXSIZE * 2]; /**< host (address) */
|
||||
char fil[HTS_URLMAXSIZE * 2]; /**< remote file path */
|
||||
};
|
||||
|
||||
// adr, fil, save
|
||||
/** A remote URL plus the local on-disk path it is saved to. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_adrfilsave
|
||||
#define HTS_DEF_FWSTRUCT_lien_adrfilsave
|
||||
typedef struct lien_adrfilsave lien_adrfilsave;
|
||||
#endif
|
||||
struct lien_adrfilsave {
|
||||
lien_adrfil af;
|
||||
char save[HTS_URLMAXSIZE * 2]; // nom à sauver sur disque (avec chemin éventuel)
|
||||
char save[HTS_URLMAXSIZE * 2]; /**< local save path (with directory) */
|
||||
};
|
||||
|
||||
/** The download-slot ring: the set of concurrent transfers in flight.
|
||||
Allocated/owned by the engine; consumers (status callbacks, the loop)
|
||||
read it but do not resize or free it. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_struct_back
|
||||
#define HTS_DEF_FWSTRUCT_struct_back
|
||||
typedef struct struct_back struct_back;
|
||||
#endif
|
||||
struct struct_back {
|
||||
lien_back *lnk;
|
||||
int count;
|
||||
coucal ready;
|
||||
LLint ready_size_bytes;
|
||||
lien_back *lnk; /**< slot array, valid indices [0..count-1]
|
||||
(count+1 entries allocated); a slot is
|
||||
active iff lnk[i].status != STATUS_FREE.
|
||||
See struct lien_back in htsopt.h and the
|
||||
STATUS_* codes in htsbasenet.h. */
|
||||
int count; /**< number of usable slots (back_max) */
|
||||
coucal ready; /**< index of slots whose transfer completed */
|
||||
LLint ready_size_bytes; /**< total bytes buffered in completed slots */
|
||||
};
|
||||
|
||||
typedef struct cache_back_zip_entry cache_back_zip_entry;
|
||||
|
||||
// cache
|
||||
/** Open handle to the mirror cache (the read-from-old / write-to-new state
|
||||
used to resume and to avoid re-fetching unchanged files). Engine-owned. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_cache_back
|
||||
#define HTS_DEF_FWSTRUCT_cache_back
|
||||
typedef struct cache_back cache_back;
|
||||
#endif
|
||||
struct cache_back {
|
||||
int version; // 0 ou 1
|
||||
int version; /**< cache-file format version being read */
|
||||
/* */
|
||||
int type;
|
||||
int ro;
|
||||
FILE *dat, *ndx, *olddat;
|
||||
char *use; // liste des adr+fil
|
||||
FILE *lst; // liste des fichiers pour la "purge"
|
||||
FILE *txt; // liste des fichiers (info)
|
||||
int ro; /**< read-only: no new cache is written */
|
||||
FILE *dat, *ndx, *olddat; /**< new data, new index, old data files */
|
||||
char *use; /**< in-memory list of cached adr+fil keys */
|
||||
FILE *lst; /**< file list, used for purge */
|
||||
FILE *txt; /**< human-readable file list (info) */
|
||||
char lastmodified[256];
|
||||
// HASH
|
||||
coucal hashtable;
|
||||
// HASH for tests (naming subsystem)
|
||||
coucal cached_tests;
|
||||
// fichiers log optionnels
|
||||
/* optional log files */
|
||||
FILE *log;
|
||||
FILE *errlog;
|
||||
// variables
|
||||
int ptr_ant; // pointeur pour anticiper
|
||||
int ptr_last; // pointeur pour anticiper
|
||||
//
|
||||
/* read-ahead cursors into the old cache */
|
||||
int ptr_ant;
|
||||
int ptr_last;
|
||||
/* ZIP-backed cache backend (newer format) */
|
||||
void *zipInput;
|
||||
void *zipOutput;
|
||||
cache_back_zip_entry *zipEntries;
|
||||
@@ -199,16 +210,19 @@ struct cache_back {
|
||||
#define HTS_DEF_FWSTRUCT_hash_struct
|
||||
typedef struct hash_struct hash_struct;
|
||||
#endif
|
||||
/** Lookup indexes over the link heap: map save-name / URL back to a link, so a
|
||||
URL seen twice resolves to one entry. The coucal tables index into liens;
|
||||
they do not own the links. */
|
||||
struct hash_struct {
|
||||
/* Links big array reference */
|
||||
/* points at the engine's link array (opt->liens); not owned */
|
||||
const lien_url *const*const*liens;
|
||||
/* Savename (case insensitive ; lowercased) */
|
||||
/* save-name -> link index (case-insensitive: keys lowercased) */
|
||||
coucal sav;
|
||||
/* Address and path */
|
||||
/* address+path -> link index */
|
||||
coucal adrfil;
|
||||
/* Former address and path */
|
||||
/* former address+path -> link index (renamed/moved entries) */
|
||||
coucal former_adrfil;
|
||||
/** Buffers **/
|
||||
/* scratch buffers reused across lookups (not reentrant) */
|
||||
int normalized;
|
||||
char normfil[HTS_URLMAXSIZE * 2];
|
||||
char normfil2[HTS_URLMAXSIZE * 2];
|
||||
@@ -219,113 +233,171 @@ struct hash_struct {
|
||||
#define HTS_DEF_FWSTRUCT_filecreate_params
|
||||
typedef struct filecreate_params filecreate_params;
|
||||
#endif
|
||||
/** Parameters threaded through file-creation callbacks (filenote). */
|
||||
struct filecreate_params {
|
||||
FILE *lst;
|
||||
FILE *lst; /**< open file list to append created paths to */
|
||||
char path[HTS_URLMAXSIZE * 2];
|
||||
};
|
||||
|
||||
/* Access macros. */
|
||||
/* Convenience accessors over the link heap; assume `opt` (and where used,
|
||||
`ptr`/`parent_relative`) are in scope. heap(N) is the Nth link;
|
||||
heap_top_index() is the last recorded link's index. */
|
||||
#define heap(N) (opt->liens[N])
|
||||
|
||||
#define heap_top_index() (opt->lien_tot - 1)
|
||||
|
||||
#define heap_top() (heap(heap_top_index()))
|
||||
|
||||
#define urladr() (heap(ptr)->adr)
|
||||
|
||||
#define urlfil() (heap(ptr)->fil)
|
||||
|
||||
#define savename() (heap(ptr)->sav)
|
||||
|
||||
#define parenturladr() (heap(heap(ptr)->precedent)->adr)
|
||||
|
||||
#define parenturlfil() (heap(heap(ptr)->precedent)->fil)
|
||||
|
||||
#define parentsavename() (heap(heap(ptr)->precedent)->sav)
|
||||
|
||||
#define relativeurladr() ((!parent_relative)?urladr():parenturladr())
|
||||
|
||||
#define relativeurlfil() ((!parent_relative)?urlfil():parenturlfil())
|
||||
|
||||
#define relativesavename() ((!parent_relative)?savename():parentsavename())
|
||||
|
||||
/* Library internal definictions */
|
||||
/* Library-internal helpers (engine-only, HTS_INTERNAL_BYTECODE). */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
/* True if a new cache is being written (plain or zip backend). */
|
||||
HTS_STATIC int cache_writable(cache_back * cache) {
|
||||
return (cache != NULL && (cache->dat != NULL || cache->zipOutput != NULL));
|
||||
}
|
||||
|
||||
/* True if an old cache is available to read (plain or zip backend). */
|
||||
HTS_STATIC int cache_readable(cache_back * cache) {
|
||||
return (cache != NULL && (cache->olddat != NULL || cache->zipInput != NULL));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Fonctions
|
||||
// Functions
|
||||
|
||||
// INCLUDES .H PARTIES DE CODE HTTRACK
|
||||
|
||||
/* Library internal definictions */
|
||||
/* Library-internal only (engine TUs). */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
char *hts_cancel_file_pop(httrackp * opt);
|
||||
|
||||
#endif
|
||||
|
||||
// add a link on the heap
|
||||
/* Record a link on the heap. All strings are copied (caller keeps ownership).
|
||||
Returns 1 on success, 0 if the link limit (opt->maxlink) is reached. */
|
||||
int hts_record_link(httrackp * opt,
|
||||
const char *address, const char *file, const char *save,
|
||||
const char *ref_address, const char *ref_file,
|
||||
const char *codebase);
|
||||
|
||||
// index of the latest added link
|
||||
/* Index of the most recently recorded link. */
|
||||
size_t hts_record_link_latest(httrackp *opt);
|
||||
|
||||
// invalidate an entry
|
||||
/* Mark link at index lpos as not to be processed (sets pass2 = -1). */
|
||||
void hts_invalidate_link(httrackp * opt, int lpos);
|
||||
|
||||
// wipe all records
|
||||
/* Reset / free the engine's link heap. */
|
||||
void hts_record_init(httrackp *opt);
|
||||
|
||||
void hts_record_free(httrackp *opt);
|
||||
|
||||
//int httpmirror(char* url,int level,httrackp opt);
|
||||
/* Run the mirror for the given start URL(s) under opt. Top-level engine entry.
|
||||
*/
|
||||
int httpmirror(char *url1, httrackp * opt);
|
||||
|
||||
/* Write len bytes of adr to local path s. url_adr/url_fil (may be NULL) name
|
||||
the source URL for logging/notification. */
|
||||
int filesave(httrackp * opt, const char *adr, int len, const char *s,
|
||||
const char *url_adr /* = NULL */ ,
|
||||
const char *url_fil /* = NULL */ );
|
||||
|
||||
char *hts_cancel_file_pop(httrackp * opt);
|
||||
|
||||
int check_fatal_io_errno(void);
|
||||
|
||||
int engine_stats(void);
|
||||
|
||||
void host_ban(httrackp * opt, int ptr, struct_back * sback, const char *host);
|
||||
|
||||
/* Open local file s for writing (filecreate, truncate) or appending
|
||||
(fileappend), creating parent directories as needed. Return an open FILE*
|
||||
the caller must fclose(), or NULL on failure. */
|
||||
FILE *filecreate(filenote_strc * strct, const char *s);
|
||||
|
||||
FILE *fileappend(filenote_strc * strct, const char *s);
|
||||
|
||||
/* Create an empty file, return 1 on success, 0 on failure. */
|
||||
int filecreateempty(filenote_strc * strct, const char *filename);
|
||||
|
||||
int filenote(filenote_strc * strct, const char *s, filecreate_params * params);
|
||||
|
||||
void file_notify(httrackp * opt, const char *adr, const char *fil,
|
||||
const char *save, int create, int modify, int wasupdated);
|
||||
|
||||
void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
const char *adr, const char *fil);
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
|
||||
char *next_token(char *p, int flag);
|
||||
|
||||
//
|
||||
/* Read a whole file into a freshly malloc'd, NUL-terminated buffer; the caller
|
||||
owns it and must release it with freet(). Return NULL on missing/unreadable
|
||||
file (readfile_or substitutes defaultdata instead). The byte content is NOT
|
||||
transcoded except readfile_utf8, which expects a UTF-8 path. readfile2
|
||||
reports the byte size (excluding the NUL) via *size when non-NULL. */
|
||||
char *readfile(const char *fil);
|
||||
|
||||
char *readfile2(const char *fil, LLint * size);
|
||||
|
||||
char *readfile_utf8(const char *fil);
|
||||
|
||||
char *readfile_or(const char *fil, const char *defaultdata);
|
||||
|
||||
#if 0
|
||||
void check_rate(TStamp stat_timestart, int maxrate);
|
||||
#endif
|
||||
|
||||
// liens
|
||||
int liens_record(char *adr, char *fil, char *save, char *former_adr,
|
||||
char *former_fil, char *codebase);
|
||||
/* Backing (download-slot) scheduler. Operate on the back[] ring (struct_back).
|
||||
Not thread-safe; call from the single crawl loop. */
|
||||
|
||||
// backing, routines externes
|
||||
/* How many new sockets may be opened now, honoring maxsoc and the maxconn rate
|
||||
limit (>=0). _strict ignores reserved-slot headroom; the plain form leaves
|
||||
room for naming tests and stops at 0 when the stack is nearly full. */
|
||||
int back_pluggable_sockets(struct_back * sback, httrackp * opt);
|
||||
|
||||
int back_pluggable_sockets_strict(struct_back * sback, httrackp * opt);
|
||||
|
||||
/* Schedule more links from the heap into free slots. Returns the number queued,
|
||||
or <=0 if none could be added (no free slot / paused / stopped). */
|
||||
int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
int ptr, int numero_passe);
|
||||
int backlinks_done(const struct_back * sback, lien_url ** liens,
|
||||
int lien_tot, int ptr);
|
||||
|
||||
/* Count of links already finished (in background or served from cache). */
|
||||
int backlinks_done(const struct_back *sback, lien_url **liens, int lien_tot,
|
||||
int ptr);
|
||||
|
||||
/* Like back_fill, but a no-op (returns -1) when in-memory buffered data already
|
||||
exceeds opt->maxcache. */
|
||||
int back_fillmax(struct_back * sback, httrackp * opt, cache_back * cache,
|
||||
int ptr, int numero_passe);
|
||||
|
||||
/* Interactive prompt: continue an interrupted mirror? Returns nonzero to go on.
|
||||
*/
|
||||
int ask_continue(httrackp * opt);
|
||||
|
||||
/* Number of decimal digits in n. */
|
||||
int nombre_digit(int n);
|
||||
|
||||
// Java
|
||||
@@ -336,17 +408,23 @@ int hts_add_file(char *file, int file_position);
|
||||
// Polling
|
||||
#if HTS_POLL
|
||||
int check_flot(T_SOC s);
|
||||
|
||||
int check_stdin(void);
|
||||
|
||||
int read_stdin(char *s, int max);
|
||||
#endif
|
||||
/* Socket readiness probes: nonzero if the socket has an error / has data. */
|
||||
int check_sockerror(T_SOC s);
|
||||
|
||||
int check_sockdata(T_SOC s);
|
||||
|
||||
/* external modules */
|
||||
/* external modules: register a link discovered by a parser plugin. */
|
||||
int htsAddLink(htsmoduleStruct * str, char *link);
|
||||
|
||||
// Void
|
||||
/* No-op function (used as a do-nothing callback / to defeat optimizers). */
|
||||
void voidf(void);
|
||||
|
||||
/* HTML marker comment marking where the top index is spliced. */
|
||||
#define HTS_TOPINDEX "TOP_INDEX_HTTRACK"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2440,6 +2440,22 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'B': // golden cache fixture read: httrack -#B <dir> [regen]
|
||||
if (na + 1 < argc) {
|
||||
const int regen =
|
||||
(na + 2 < argc && strcmp(argv[na + 2], "regen") == 0);
|
||||
const int err =
|
||||
cache_golden_selftest(opt, argv[na + 1], regen);
|
||||
|
||||
printf("cache-golden: %s\n", err ? "FAIL" : "OK");
|
||||
htsmain_free();
|
||||
return err;
|
||||
} else {
|
||||
fprintf(stderr, "Option #B requires a directory argument\n");
|
||||
htsmain_free();
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case 'C': // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
|
||||
{
|
||||
int hasFilter = 0;
|
||||
|
||||
113
src/htsdefines.h
113
src/htsdefines.h
@@ -30,11 +30,16 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
// Fichier librairie .h
|
||||
/** @file htsdefines.h
|
||||
* Public callback prototypes and the wrapper/plug-in interface: the function
|
||||
* pointer types a parser or wrapper module implements, and the callback table
|
||||
* the engine dispatches through. */
|
||||
#ifndef HTS_DEFINES_DEFH
|
||||
#define HTS_DEFINES_DEFH
|
||||
|
||||
/* Forward definitions */
|
||||
/* Forward declarations of engine structs, so this header is usable without
|
||||
pulling in their full definitions. Each is guarded so multiple public
|
||||
headers can repeat the typedef without clashing. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||
#define HTS_DEF_FWSTRUCT_httrackp
|
||||
typedef struct httrackp httrackp;
|
||||
@@ -64,7 +69,8 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
#endif
|
||||
|
||||
/* External callbacks */
|
||||
/* Marks a symbol an external wrapper module exports back to the engine
|
||||
(dllexport on Windows, nothing elsewhere). */
|
||||
#ifndef EXTERNAL_FUNCTION
|
||||
#ifdef _WIN32
|
||||
#define EXTERNAL_FUNCTION __declspec(dllexport)
|
||||
@@ -73,78 +79,141 @@ typedef struct t_hts_callbackarg t_hts_callbackarg;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* --wrapper plug function prototype */
|
||||
|
||||
/* Entry points of a --wrapper plug-in: hts_plug(opt, argv) is called once to
|
||||
install the wrapper (argv is the wrapper's argument string), hts_unplug(opt)
|
||||
once to tear it down. Both return non-zero on success. */
|
||||
typedef int (*t_hts_plug) (httrackp * opt, const char *argv);
|
||||
|
||||
typedef int (*t_hts_unplug) (httrackp * opt);
|
||||
|
||||
/* htsopt function callbacks definitions */
|
||||
/* Engine callback prototypes. Each is one hook the engine fires at a defined
|
||||
point of a mirror; a wrapper installs the ones it cares about in the
|
||||
callback table below. carg carries the user-defined argument chain; int
|
||||
returns are 1 to continue/accept, 0 to abort/refuse unless noted. */
|
||||
|
||||
/* Called once when the wrapper is installed; allocate per-run state here. */
|
||||
typedef void (*t_hts_htmlcheck_init) (t_hts_callbackarg * carg);
|
||||
|
||||
/* Called once when the wrapper is removed; release per-run state here. */
|
||||
typedef void (*t_hts_htmlcheck_uninit) (t_hts_callbackarg * carg);
|
||||
|
||||
/* Fired at the start of a mirror, after options are parsed. */
|
||||
typedef int (*t_hts_htmlcheck_start) (t_hts_callbackarg * carg, httrackp * opt);
|
||||
|
||||
/* Fired at the end of a mirror. */
|
||||
typedef int (*t_hts_htmlcheck_end) (t_hts_callbackarg * carg, httrackp * opt);
|
||||
|
||||
/* Fired while options are being changed, to validate or adjust them. */
|
||||
typedef int (*t_hts_htmlcheck_chopt) (t_hts_callbackarg * carg, httrackp * opt);
|
||||
|
||||
/* Rewrite hook over an in-memory page: the html and len arguments point at the
|
||||
buffer and its length (the callback may reallocate and resize it),
|
||||
url_adresse and url_fichier name it. */
|
||||
typedef int (*t_hts_htmlcheck_process) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char **html, int *len,
|
||||
const char *url_adresse,
|
||||
const char *url_fichier);
|
||||
|
||||
/* Same shape as process, run before HTML parsing. */
|
||||
typedef t_hts_htmlcheck_process t_hts_htmlcheck_preprocess;
|
||||
|
||||
/* Same shape as process, run after HTML parsing. */
|
||||
typedef t_hts_htmlcheck_process t_hts_htmlcheck_postprocess;
|
||||
|
||||
/* Inspect a page (read-only html/len) without rewriting it. */
|
||||
typedef int (*t_hts_htmlcheck_check_html) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char *html, int len,
|
||||
const char *url_adresse,
|
||||
const char *url_fichier);
|
||||
|
||||
/* Answer an engine query identified by 'question'; returns the answer string
|
||||
(owned by the callback, must stay valid until the next call). */
|
||||
typedef const char *(*t_hts_htmlcheck_query) (t_hts_callbackarg * carg,
|
||||
httrackp * opt,
|
||||
const char *question);
|
||||
|
||||
/* Second query channel, same contract as query. */
|
||||
typedef const char *(*t_hts_htmlcheck_query2) (t_hts_callbackarg * carg,
|
||||
httrackp * opt,
|
||||
const char *question);
|
||||
|
||||
/* Third query channel, same contract as query. */
|
||||
typedef const char *(*t_hts_htmlcheck_query3) (t_hts_callbackarg * carg,
|
||||
httrackp * opt,
|
||||
const char *question);
|
||||
|
||||
/* Per-tick progress hook: 'back' is the transfer slot array of 'back_max'
|
||||
entries, back_index the active one; lien_tot/lien_ntot and stats report
|
||||
queue size and running totals, stat_time the elapsed time. */
|
||||
typedef int (*t_hts_htmlcheck_loop) (t_hts_callbackarg * carg, httrackp * opt,
|
||||
lien_back * back, int back_max,
|
||||
int back_index, int lien_tot,
|
||||
int lien_ntot, int stat_time,
|
||||
hts_stat_struct * stats);
|
||||
|
||||
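/* Accept/refuse hook for a link (adr host, fil path); status is the engine's
|
||||
verdict so far and the return value replaces it. NOTE(review): the plug-in
|
||||
guide gives 0 = accept, 1 = refuse, -1 = no opinion (0 does not drop) - confirm. */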
|
||||
typedef int (*t_hts_htmlcheck_check_link) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, const char *adr,
|
||||
const char *fil, int status);
|
||||
|
||||
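/* Accept/refuse hook for a link based on its MIME type; status is the engine's
|
||||
verdict so far and the return value replaces it. NOTE(review): the plug-in
|
||||
guide gives 0 = accept, 1 = refuse, -1 = no opinion (0 does not skip) - confirm. */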
|
||||
typedef int (*t_hts_htmlcheck_check_mime) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, const char *adr,
|
||||
const char *fil, const char *mime,
|
||||
int status);
|
||||
|
||||
/* Fired when the mirror pauses, waiting on 'lockfile' to be removed. */
|
||||
typedef void (*t_hts_htmlcheck_pause) (t_hts_callbackarg * carg, httrackp * opt,
|
||||
const char *lockfile);
|
||||
|
||||
/* Fired after a file is written to disk; 'file' is the local path. */
|
||||
typedef void (*t_hts_htmlcheck_filesave) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, const char *file);
|
||||
|
||||
/* Richer file-saved notification: source host/filename, local path, and flags
|
||||
telling whether the file is new, modified, or left unchanged. */
|
||||
typedef void (*t_hts_htmlcheck_filesave2) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, const char *hostname,
|
||||
const char *filename,
|
||||
const char *localfile, int is_new,
|
||||
int is_modified, int not_updated);
|
||||
|
||||
/* Fired for each link parsed out of a page; 'link' may be edited in place. */
|
||||
typedef int (*t_hts_htmlcheck_linkdetected) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char *link);
|
||||
|
||||
/* As linkdetected, plus tag_start, the markup the link was found in. */
|
||||
typedef int (*t_hts_htmlcheck_linkdetected2) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char *link,
|
||||
const char *tag_start);
|
||||
|
||||
/* Fired on each transfer-status change of slot 'back'. */
|
||||
typedef int (*t_hts_htmlcheck_xfrstatus) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, lien_back * back);
|
||||
|
||||
/* Choose the local save path for a URL; write it into 'save'. adr/fil name the
|
||||
target, referer_adr/referer_fil the page that linked it. */
|
||||
typedef int (*t_hts_htmlcheck_savename) (t_hts_callbackarg * carg,
|
||||
httrackp * opt,
|
||||
const char *adr_complete,
|
||||
const char *fil_complete,
|
||||
const char *referer_adr,
|
||||
const char *referer_fil, char *save);
|
||||
|
||||
/* Extended save-name hook, same signature as savename. */
|
||||
typedef t_hts_htmlcheck_savename t_hts_htmlcheck_extsavename;
|
||||
|
||||
/* Inspect or edit the outgoing request headers in 'buff' before they are sent.
|
||||
*/
|
||||
typedef int (*t_hts_htmlcheck_sendhead) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char *buff,
|
||||
const char *adr, const char *fil,
|
||||
const char *referer_adr,
|
||||
const char *referer_fil,
|
||||
htsblk * outgoing);
|
||||
|
||||
/* Inspect the incoming response headers in 'buff' after they are received. */
|
||||
typedef int (*t_hts_htmlcheck_receivehead) (t_hts_callbackarg * carg,
|
||||
httrackp * opt, char *buff,
|
||||
const char *adr, const char *fil,
|
||||
@@ -152,9 +221,11 @@ typedef int (*t_hts_htmlcheck_receivehead) (t_hts_callbackarg * carg,
|
||||
const char *referer_fil,
|
||||
htsblk * incoming);
|
||||
|
||||
/* External additional parsing module(s) */
|
||||
/* External parser module hooks: detect claims a document type (return 1 to
|
||||
take it), parse then extracts its links. 'str' carries the document. */
|
||||
typedef int (*t_hts_htmlcheck_detect) (t_hts_callbackarg * carg, httrackp * opt,
|
||||
htsmoduleStruct * str);
|
||||
|
||||
typedef int (*t_hts_htmlcheck_parse) (t_hts_callbackarg * carg, httrackp * opt,
|
||||
htsmoduleStruct * str);
|
||||
|
||||
@@ -164,20 +235,24 @@ typedef int (*t_hts_htmlcheck_parse) (t_hts_callbackarg * carg, httrackp * opt,
|
||||
typedef struct t_hts_htmlcheck_callbacks t_hts_htmlcheck_callbacks;
|
||||
#endif
|
||||
|
||||
/* Callabck array */
|
||||
/* Declares one named callback slot: its function pointer (typed
|
||||
t_hts_htmlcheck_<NAME>) paired with the carg passed to it. */
|
||||
#define DEFCALLBACK(NAME) \
|
||||
struct NAME { \
|
||||
t_hts_htmlcheck_ ##NAME fun; \
|
||||
t_hts_callbackarg *carg; \
|
||||
} NAME
|
||||
|
||||
/* Callback items */
|
||||
/* Generic, type-erased callback slot used where the hook type is opaque. */
|
||||
typedef void *t_hts_htmlcheck_t_hts_htmlcheck_callbacks_item;
|
||||
|
||||
typedef DEFCALLBACK(t_hts_htmlcheck_callbacks_item);
|
||||
|
||||
/* Linked list, which should be used for the 'arg' user-defined argument */
|
||||
/* Per-callback argument node. Wrappers chain these so a new hook can wrap an
|
||||
existing one: userdef is the wrapper's own data, prev points back to the
|
||||
function and carg it displaced (call it to keep the previous behavior). */
|
||||
struct t_hts_callbackarg {
|
||||
/* User-defined agument for the called function */
|
||||
/* User-defined argument for the called function */
|
||||
void *userdef;
|
||||
|
||||
/* Previous function, if any (fun != NULL) */
|
||||
@@ -187,7 +262,9 @@ struct t_hts_callbackarg {
|
||||
} prev;
|
||||
};
|
||||
|
||||
/* Callback structure */
|
||||
/* The full callback table, one slot per hook; installed in httrackp options
|
||||
and dispatched by the engine. The trailing comments mark the API version a
|
||||
slot first appeared in. */
|
||||
struct t_hts_htmlcheck_callbacks {
|
||||
/* v3.41 */
|
||||
DEFCALLBACK(init);
|
||||
@@ -219,9 +296,11 @@ struct t_hts_htmlcheck_callbacks {
|
||||
DEFCALLBACK(extsavename);
|
||||
};
|
||||
|
||||
/* Library internal definitions */
|
||||
/* Library-internal helpers, compiled only inside the engine. */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
/* Maps a callback slot's name to its byte offset in the callback table, so a
|
||||
slot can be installed by name. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_t_hts_callback_ref
|
||||
#define HTS_DEF_FWSTRUCT_t_hts_callback_ref
|
||||
typedef struct t_hts_callback_ref t_hts_callback_ref;
|
||||
@@ -235,18 +314,26 @@ struct t_hts_callback_ref {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Default (no-op) callback table the engine starts from. */
|
||||
extern const t_hts_htmlcheck_callbacks default_callbacks;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Internal helpers for building an HTTP request/response into the engine's
|
||||
scratch buffer (opt->state.HTbuff): START resets it, PRINT appends; the
|
||||
PANIC variant records a fatal error message. */
|
||||
#define HT_PRINT(A) strcatbuff(opt->state.HTbuff,A);
|
||||
|
||||
#define HT_REQUEST_START opt->state.HTbuff[0]='\0';
|
||||
|
||||
#define HT_REQUEST_END
|
||||
#define HTT_REQUEST_START opt->state.HTbuff[0]='\0';
|
||||
|
||||
#define HTT_REQUEST_END
|
||||
#define HTS_REQUEST_START opt->state.HTbuff[0]='\0';
|
||||
|
||||
#define HTS_REQUEST_END
|
||||
#define HTS_PANIC_PRINTF(S) strcpybuff(opt->state._hts_errmsg,S);
|
||||
|
||||
|
||||
105
src/htsglobal.h
105
src/htsglobal.h
@@ -30,12 +30,19 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
// Fichier réunissant l'ensemble des defines
|
||||
/** @file htsglobal.h
|
||||
* Foundational portability layer included by every other public header:
|
||||
* version strings, platform/feature switches, the HTSEXT_API export marker,
|
||||
* the integer/time/socket typedefs (LLint, TStamp, INTsys, T_SOC), printf
|
||||
* format helpers, and the file-access mode constants. */
|
||||
|
||||
#ifndef HTTRACK_GLOBAL_DEFH
|
||||
#define HTTRACK_GLOBAL_DEFH
|
||||
|
||||
// Version (also check external version information)
|
||||
/* Package version strings (the library ABI version is VERSION_INFO in
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-8"
|
||||
#define HTTRACK_VERSIONID "3.49.8"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
@@ -46,7 +53,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
// Définition plate-forme
|
||||
// Platform detection (sizes, feature macros)
|
||||
#include "htsconfig.h"
|
||||
|
||||
// WIN32 types
|
||||
@@ -57,11 +64,17 @@ Please visit our Website: http://www.httrack.com
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* GCC extension */
|
||||
/* Compiler-attribute helpers, no-ops where unsupported.
|
||||
HTS_UNUSED: suppress unused-symbol warnings. HTS_STATIC: an unused-safe
|
||||
static. HTS_PRINTF_FUN(fmt, arg): mark a printf-like function so the
|
||||
compiler type-checks the format string at argument index fmt against the
|
||||
varargs starting at arg. */
|
||||
#ifndef HTS_UNUSED
|
||||
#ifdef __GNUC__
|
||||
#define HTS_UNUSED __attribute__ ((unused))
|
||||
|
||||
#define HTS_STATIC static __attribute__ ((unused))
|
||||
|
||||
#define HTS_PRINTF_FUN(fmt, arg) __attribute__ ((format (printf, fmt, arg)))
|
||||
#else
|
||||
#define HTS_UNUSED
|
||||
@@ -86,6 +99,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#endif
|
||||
#ifndef S_ISREG
|
||||
#define S_ISREG(m) ((m) & _S_IFREG)
|
||||
|
||||
#define S_ISDIR(m) ((m) & _S_IFDIR)
|
||||
#endif
|
||||
|
||||
@@ -132,7 +146,7 @@ Please visit our Website: http://www.httrack.com
|
||||
#define BIGSTK
|
||||
#endif
|
||||
|
||||
// compatibilité DOS
|
||||
// DOS-style 8.3 filenames? 1 on Windows, 0 elsewhere
|
||||
#ifdef _WIN32
|
||||
#define HTS_DOSNAME 1
|
||||
#else
|
||||
@@ -168,7 +182,10 @@ Please visit our Website: http://www.httrack.com
|
||||
#define __cdecl
|
||||
#endif
|
||||
|
||||
/* rc file */
|
||||
/* Install paths and config-file names. HTTRACKRC is the per-user rc filename,
|
||||
HTTRACKCNF the system-wide config, HTTRACKDIR the shared data directory; the
|
||||
ETC/BIN/LIB/PREFIX paths are the defaults these derive from when not set by
|
||||
the build. */
|
||||
#ifdef _WIN32
|
||||
#define HTS_HTTRACKRC "httrackrc"
|
||||
#else
|
||||
@@ -197,9 +214,11 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#endif
|
||||
|
||||
/* Max URL length */
|
||||
/* Maximum URL length, in bytes. Callers size URL/path string buffers to this;
|
||||
anything longer is rejected. */
|
||||
#define HTS_URLMAXSIZE 1024
|
||||
/* Max command-line length (>=HTS_URLMAXSIZE*2) */
|
||||
/* Maximum command-line argument length, in bytes. NOTE(review): meant to stay
|
||||
>= HTS_URLMAXSIZE*2 so an addr+path pair fits, but both are 1024 - confirm. */
|
||||
#define HTS_CDLMAXSIZE 1024
|
||||
/* MIME-type buffer contract (htsblk.contenttype/charset/contentencoding); holds
|
||||
the longest registered MIME type, the Office OOXML ones reaching 73 chars */
|
||||
@@ -219,24 +238,30 @@ Please visit our Website: http://www.httrack.com
|
||||
#define LF "\x0a"
|
||||
#endif
|
||||
|
||||
/* équivaut à "paramètre vide", par exemple -F (none) */
|
||||
/* Sentinel meaning "empty parameter", e.g. -F (none) */
|
||||
#define HTS_NOPARAM "(none)"
|
||||
#define HTS_NOPARAM2 "\"(none)\""
|
||||
|
||||
/* maximum et minimum */
|
||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||
#define maximum(A,B) ( (A) > (B) ? (A) : (B) )
|
||||
|
||||
#define minimum(A,B) ( (A) < (B) ? (A) : (B) )
|
||||
|
||||
/* chaine no empty ? (and not null) */
|
||||
/* True when A is a non-NULL, non-empty string. */
|
||||
#define strnotempty(A) (((A) != NULL && (A)[0] != '\0'))
|
||||
|
||||
/* optimisation inline si possible */
|
||||
/* 'inline' where the dialect supports it (C++), nothing in plain C. */
|
||||
#ifdef __cplusplus
|
||||
#define HTS_INLINE inline
|
||||
#else
|
||||
#define HTS_INLINE
|
||||
#endif
|
||||
|
||||
/* Marks a symbol as part of the library's public ABI: exported from
|
||||
libhttrack and visible to callers. Symbols without it stay internal (hidden
|
||||
under -fvisibility=hidden). Expands to dllexport when building the library,
|
||||
dllimport when consuming it, and the visibility("default") attribute on
|
||||
ELF. */
|
||||
#ifdef _WIN32
|
||||
#ifdef LIBHTTRACK_EXPORTS
|
||||
#define HTSEXT_API __declspec(dllexport)
|
||||
@@ -247,6 +272,7 @@ Please visit our Website: http://www.httrack.com
|
||||
/* See <http://gcc.gnu.org/wiki/Visibility> */
|
||||
#if ( ( defined(__GNUC__) && ( __GNUC__ >= 4 ) ) \
|
||||
|| ( defined(HAVE_VISIBILITY) && HAVE_VISIBILITY ) )
|
||||
|
||||
#define HTSEXT_API __attribute__ ((visibility ("default")))
|
||||
#else
|
||||
#define HTSEXT_API
|
||||
@@ -260,10 +286,13 @@ Please visit our Website: http://www.httrack.com
|
||||
*/
|
||||
#if defined(__GNUC__) && \
|
||||
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
|
||||
|
||||
#define HTS_DEPRECATED(msg) __attribute__((deprecated(msg)))
|
||||
#elif defined(__GNUC__)
|
||||
|
||||
#define HTS_DEPRECATED(msg) __attribute__((deprecated))
|
||||
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
|
||||
|
||||
#define HTS_DEPRECATED(msg) __declspec(deprecated(msg))
|
||||
#else
|
||||
#define HTS_DEPRECATED(msg)
|
||||
@@ -277,12 +306,16 @@ Please visit our Website: http://www.httrack.com
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// long long int? (or int)
|
||||
// (and int cast for system functions like malloc() )
|
||||
|
||||
/* Wide integer types, chosen per platform.
|
||||
LLint: signed 64-bit counter for byte counts and large sizes (falls back to
|
||||
plain int where 64-bit is unavailable).
|
||||
TStamp: timestamp/duration in the same width (a double in the no-64-bit
|
||||
fallback).
|
||||
LLintP: the printf conversion for an LLint. */
|
||||
#if HTS_LONGLONG
|
||||
#ifdef LLINT_FORMAT
|
||||
typedef LLINT_TYPE LLint;
|
||||
|
||||
typedef LLINT_TYPE TStamp;
|
||||
|
||||
#define LLintP LLINT_FORMAT
|
||||
@@ -290,17 +323,21 @@ typedef LLINT_TYPE TStamp;
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef __int64 LLint;
|
||||
|
||||
typedef __int64 TStamp;
|
||||
|
||||
#define LLintP "%I64d"
|
||||
#elif (defined(_LP64) || defined(__x86_64__) \
|
||||
|| defined(__powerpc64__) || defined(__64BIT__))
|
||||
|
||||
typedef long int LLint;
|
||||
|
||||
typedef long int TStamp;
|
||||
|
||||
#define LLintP "%ld"
|
||||
#else
|
||||
typedef long long int LLint;
|
||||
|
||||
typedef long long int TStamp;
|
||||
|
||||
#define LLintP "%lld"
|
||||
@@ -315,6 +352,9 @@ typedef int LLint;
|
||||
typedef double TStamp;
|
||||
#endif
|
||||
|
||||
/* Integer type for file offsets/sizes passed to the C library. Widens to
|
||||
LLint (with HTS_FSEEKO for fseeko/ftello) under large-file support, plain
|
||||
int otherwise; INTsysP is its printf conversion. */
|
||||
#ifdef LFS_FLAG
|
||||
typedef LLint INTsys;
|
||||
|
||||
@@ -328,8 +368,11 @@ typedef int INTsys;
|
||||
#define INTsysP "%d"
|
||||
#endif
|
||||
|
||||
/* Socket-handle type. An unsigned integer wide enough for a Windows SOCKET;
|
||||
a plain int file descriptor on POSIX. */
|
||||
#ifdef _WIN32
|
||||
#if defined(_WIN64)
|
||||
|
||||
typedef unsigned __int64 T_SOC;
|
||||
#else
|
||||
typedef unsigned __int32 T_SOC;
|
||||
@@ -338,7 +381,7 @@ typedef unsigned __int32 T_SOC;
|
||||
typedef int T_SOC;
|
||||
#endif
|
||||
|
||||
/* IPV4, IPV6 and various unified structures */
|
||||
/* Buffer size for a printed network address (IPv4 or IPv6, NUL included). */
|
||||
#define HTS_MAXADDRLEN 64
|
||||
|
||||
#ifdef _WIN32
|
||||
@@ -346,17 +389,22 @@ typedef int T_SOC;
|
||||
#define __cdecl
|
||||
#endif
|
||||
|
||||
/* mode pour mkdir ET chmod (accès aux fichiers) */
|
||||
/* Permission bits for created folders and files (mkdir and chmod).
|
||||
PROTECT_FOLDER is owner-only. With HTS_ACCESS set (the default) the ACCESS_
|
||||
modes also grant group/other read; otherwise they stay owner-only. */
|
||||
#define HTS_PROTECT_FOLDER (S_IRUSR|S_IWUSR|S_IXUSR)
|
||||
|
||||
#if HTS_ACCESS
|
||||
#define HTS_ACCESS_FILE (S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)
|
||||
|
||||
#define HTS_ACCESS_FOLDER (S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
|
||||
#else
|
||||
#define HTS_ACCESS_FILE (S_IRUSR|S_IWUSR)
|
||||
|
||||
#define HTS_ACCESS_FOLDER (S_IRUSR|S_IWUSR|S_IXUSR)
|
||||
#endif
|
||||
|
||||
/* vérifier la déclaration des variables préprocesseur */
|
||||
/* Sanity-check that the required preprocessor switches are defined */
|
||||
#ifndef HTS_DOSNAME
|
||||
#error | HTS_DOSNAME Has not been defined.
|
||||
#error | Set it to 1 if you are under DOS, 0 under Unix.
|
||||
@@ -366,7 +414,7 @@ typedef int T_SOC;
|
||||
#error
|
||||
#endif
|
||||
#ifndef HTS_ACCESS
|
||||
/* Par défaut, accès à tous les utilisateurs */
|
||||
/* Default: files readable by all users */
|
||||
#define HTS_ACCESS 1
|
||||
#endif
|
||||
|
||||
@@ -375,13 +423,13 @@ typedef int T_SOC;
|
||||
|
||||
/* HTSLib */
|
||||
|
||||
// Cache DNS, accélère les résolution d'adresses
|
||||
// Enable the DNS cache (speeds up address resolution)
|
||||
#define HTS_DNSCACHE 1
|
||||
|
||||
// ID d'une pseudo-socket locale pour les file://
|
||||
// Pseudo-socket id standing in for a local file:// transfer
|
||||
#define LOCAL_SOCKET_ID -2
|
||||
|
||||
// taille de chaque buffer (10 sockets 650 ko)
|
||||
// Per-connection transfer buffer size, in bytes
|
||||
#define TAILLE_BUFFER 65536
|
||||
|
||||
#ifdef HTS_DO_NOT_USE_PTHREAD
|
||||
@@ -405,6 +453,7 @@ struct mlink {
|
||||
int id;
|
||||
struct mlink *next;
|
||||
};
|
||||
|
||||
static const t_htsboundary htsboundary = 0xDEADBEEF;
|
||||
#endif
|
||||
#endif
|
||||
@@ -418,7 +467,7 @@ static const t_htsboundary htsboundary = 0xDEADBEEF;
|
||||
/* Debugging */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
// débuggage types
|
||||
// type-detection debug
|
||||
#define DEBUG_SHOWTYPES 0
|
||||
// backing debug
|
||||
#define BDEBUG 0
|
||||
@@ -436,28 +485,28 @@ static const t_htsboundary htsboundary = 0xDEADBEEF;
|
||||
#define DEBUG_ROBOTS 0
|
||||
// debug hash
|
||||
#define DEBUG_HASH 0
|
||||
// Vérification d'intégrité
|
||||
// integrity-check debug
|
||||
#define DEBUG_CHECKINT 0
|
||||
// nbr sockets debug
|
||||
#define NSDEBUG 0
|
||||
|
||||
// débuggage HTSLib
|
||||
// HTSLib debug
|
||||
#define HDEBUG 0
|
||||
// connection monitoring debug
|
||||
#define CNXDEBUG 0
|
||||
// cookies debug
|
||||
#define DEBUG_COOK 0
|
||||
// débuggage hard..
|
||||
// heavy/low-level debug
|
||||
#define HTS_WIDE_DEBUG 0
|
||||
// debug for deletehttp and co.
|
||||
#define HTS_DEBUG_CLOSESOCK 0
|
||||
// debug tracage mémoire
|
||||
// memory-tracing debug
|
||||
#define MEMDEBUG 0
|
||||
|
||||
// htsmain
|
||||
#define DEBUG_STEPS 0
|
||||
|
||||
// Débuggage de contrôle
|
||||
// Derived debug control switches
|
||||
#if HTS_DEBUG_CLOSESOCK
|
||||
#define _HTS_WIDE 1
|
||||
#endif
|
||||
|
||||
@@ -2580,8 +2580,8 @@ HTSEXT_API TStamp mtime_local(void) {
|
||||
assert(! "gettimeofday");
|
||||
}
|
||||
|
||||
return (TStamp) (((TStamp) tv.tv_sec * (TStamp) 1000)
|
||||
+ ((TStamp) tv.tv_usec / (TStamp) 1000000));
|
||||
return (TStamp) (((TStamp) tv.tv_sec * (TStamp) 1000) +
|
||||
((TStamp) tv.tv_usec / (TStamp) 1000));
|
||||
#else
|
||||
struct timeb B;
|
||||
ftime(&B);
|
||||
|
||||
@@ -31,10 +31,15 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htsmodules.h
|
||||
Loadable-parser (external module) interface. The engine hands a downloaded
|
||||
object to a module via htsmoduleStruct; the module reports discovered links
|
||||
back through the addLink callback. */
|
||||
|
||||
#ifndef HTS_MODULES
|
||||
#define HTS_MODULES
|
||||
|
||||
/* Forware definitions */
|
||||
/* Forward definitions */
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_url
|
||||
#define HTS_DEF_FWSTRUCT_lien_url
|
||||
typedef struct lien_url lien_url;
|
||||
@@ -56,18 +61,18 @@ typedef struct cache_back cache_back;
|
||||
typedef struct hash_struct hash_struct;
|
||||
#endif
|
||||
|
||||
/* Function type to add links inside the module
|
||||
link : link to add (absolute or relative)
|
||||
str : structure defined below
|
||||
Returns 1 if the link was added, 0 if not
|
||||
*/
|
||||
/** Callback a module invokes to report a discovered link.
|
||||
str: the per-object context the module was called with.
|
||||
link: link to add (absolute or relative); the engine copies it.
|
||||
Returns 1 if the engine accepted/queued the link, 0 if it was rejected. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsmoduleStruct
|
||||
#define HTS_DEF_FWSTRUCT_htsmoduleStruct
|
||||
typedef struct htsmoduleStruct htsmoduleStruct;
|
||||
#endif
|
||||
typedef int (*t_htsAddLink) (htsmoduleStruct * str, char *link);
|
||||
|
||||
/* Structure passed to the module */
|
||||
/** Per-object context passed to a parser module for one downloaded file.
|
||||
Field access classes are noted; engine owns all pointers unless stated. */
|
||||
struct htsmoduleStruct {
|
||||
/* Read-only elements */
|
||||
const char *filename; /* filename (C:\My Web Sites\...) */
|
||||
@@ -119,21 +124,39 @@ struct htsmoduleStruct {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Used to wrap module initialization */
|
||||
/* return 1 if init was ok */
|
||||
/** Module lifecycle hooks. Init/PlugInit return 1 on success, 0 on failure;
|
||||
Exit returns its own status (ignored by the engine). */
|
||||
typedef int (*t_htsWrapperInit) (char *fn, char *args);
|
||||
|
||||
typedef int (*t_htsWrapperExit) (void);
|
||||
|
||||
typedef int (*t_htsWrapperPlugInit) (char *args);
|
||||
|
||||
/* Library internal definitions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
|
||||
/** Capabilities string ("-noV6", "-nossl", ...) followed by "+name" for each
|
||||
loaded module. Returned pointer aliases opt->state.HTbuff; do not free, and
|
||||
it is overwritten by the next call. */
|
||||
HTSEXT_API const char *hts_get_version_info(httrackp * opt);
|
||||
|
||||
/** Static capabilities string set by htspe_init(); valid for the process
|
||||
lifetime, do not free. */
|
||||
HTSEXT_API const char *hts_is_available(void);
|
||||
|
||||
/** Initialize the module subsystem (idempotent): builds the capabilities
|
||||
string and, on Windows, hardens the DLL search path. */
|
||||
extern void htspe_init(void);
|
||||
|
||||
/** Tear-down counterpart of htspe_init(); currently a no-op. */
|
||||
extern void htspe_uninit(void);
|
||||
|
||||
/** Run the external-parser callbacks for the object described by str.
|
||||
Returns the parse callback result (>=0) on a handled object, or -1 if no
|
||||
module claimed it or its wrapper_name is blacklisted. */
|
||||
extern int hts_parse_externals(htsmoduleStruct * str);
|
||||
|
||||
/*extern int swf_is_available;*/
|
||||
/** Nonzero if IPv6 support was compiled in (== HTS_INET6). */
|
||||
extern int V6_is_available;
|
||||
#endif
|
||||
|
||||
|
||||
89
src/htsnet.h
89
src/htsnet.h
@@ -32,6 +32,11 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htsnet.h
|
||||
Socket/connection layer. Provides SOCaddr, an opaque IPv4/IPv6
|
||||
socket-address wrapper, plus accessor macros so callers never branch on
|
||||
address family. Builds on htsbasenet.h. */
|
||||
|
||||
#ifndef HTS_DEFNETH
|
||||
#define HTS_DEFNETH
|
||||
|
||||
@@ -43,32 +48,32 @@ Please visit our Website: http://www.httrack.com
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#ifdef _WIN32
|
||||
// pour read
|
||||
// for read
|
||||
#include <io.h>
|
||||
// pour FindFirstFile
|
||||
// for FindFirstFile
|
||||
#include <winbase.h>
|
||||
typedef USHORT in_port_t;
|
||||
|
||||
typedef ADDRESS_FAMILY sa_family_t;
|
||||
#else
|
||||
//typedef int T_SOC;
|
||||
#define INVALID_SOCKET -1
|
||||
#include <netdb.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <sys/time.h>
|
||||
/* Force for sun env. */
|
||||
/* Force BSD_COMP for Sun environments. */
|
||||
#ifndef BSD_COMP
|
||||
#define BSD_COMP
|
||||
#endif
|
||||
#include <sys/ioctl.h>
|
||||
/* gethostname & co */
|
||||
/* gethostname & co */
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
/* inet_addr */
|
||||
/* inet_addr */
|
||||
#include <arpa/inet.h>
|
||||
// pas la peine normalement..
|
||||
/* normally not needed; provide in_addr_t where the platform lacks it */
|
||||
#ifndef HTS_DO_NOT_REDEFINE_in_addr_t
|
||||
typedef unsigned long in_addr_t;
|
||||
#endif
|
||||
@@ -78,14 +83,16 @@ typedef unsigned long in_addr_t;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Ipv4 structures */
|
||||
/** Raw IP address type: in6_addr when IPv6 is enabled, else in_addr. */
|
||||
#if HTS_INET6 != 0
|
||||
typedef struct in6_addr INaddr;
|
||||
#else
|
||||
typedef struct in_addr INaddr;
|
||||
#endif
|
||||
|
||||
/* This should handle all cases */
|
||||
/** Opaque socket address holding either an IPv4 or IPv6 endpoint. Use the
|
||||
SOCaddr_* accessors rather than touching m_addr; sa_family selects the
|
||||
active union member. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_SOCaddr
|
||||
#define HTS_DEF_FWSTRUCT_SOCaddr
|
||||
typedef struct SOCaddr SOCaddr;
|
||||
@@ -103,6 +110,8 @@ struct SOCaddr {
|
||||
} m_addr;
|
||||
};
|
||||
|
||||
/** Pointer to the port field (network byte order) for the active family.
|
||||
Asserts on NULL or an unset/unknown family. */
|
||||
static HTS_INLINE HTS_UNUSED in_port_t* SOCaddr_sinport_(SOCaddr *const addr,
|
||||
const char *file, const int line) {
|
||||
assertf_(addr != NULL, file, line);
|
||||
@@ -122,6 +131,8 @@ static HTS_INLINE HTS_UNUSED in_port_t* SOCaddr_sinport_(SOCaddr *const addr,
|
||||
}
|
||||
}
|
||||
|
||||
/** Length of the active sockaddr (sockaddr_in or sockaddr_in6), or 0 if the
|
||||
family is unset/unknown. The 0 case doubles as the "not valid" test. */
|
||||
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_size_(const SOCaddr*const addr,
|
||||
const char *file, const int line) {
|
||||
assertf_(addr != NULL, file, line);
|
||||
@@ -140,33 +151,52 @@ static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_size_(const SOCaddr*const addr,
|
||||
}
|
||||
}
|
||||
|
||||
/** Reset to the unset state (family AF_UNSPEC), making the address invalid. */
|
||||
static HTS_INLINE HTS_UNUSED void SOCaddr_clear_(SOCaddr*const addr,
|
||||
const char *file, const int line) {
|
||||
assertf_(addr != NULL, file, line);
|
||||
addr->m_addr.sa.sa_family = AF_UNSPEC;
|
||||
}
|
||||
|
||||
/* Ipv4/6 structure members */
|
||||
#define SOCaddr_sinfamily(server) ((server).m_addr.sa.sa_family)
|
||||
#define SOCaddr_sinport(server) (*SOCaddr_sinport_(&(server), __FILE__, __LINE__))
|
||||
#define SOCaddr_size(server) (SOCaddr_size_(&(server), __FILE__, __LINE__))
|
||||
#define SOCaddr_is_valid(server) (SOCaddr_size_(&(server), __FILE__, __LINE__) != 0 )
|
||||
#define SOCaddr_clear(server) SOCaddr_clear_(&(server), __FILE__, __LINE__)
|
||||
#define SOCaddr_sockaddr(server) ((server).m_addr.sa)
|
||||
#define SOCaddr_capacity(server) sizeof((server).m_addr)
|
||||
/* SOCaddr accessors; server is an lvalue SOCaddr, not a pointer. */
|
||||
#define SOCaddr_sinfamily(server) \
|
||||
((server).m_addr.sa.sa_family) /* AF_INET / AF_INET6 */
|
||||
|
||||
/* AF_xx */
|
||||
#define SOCaddr_sinport(server) \
|
||||
(*SOCaddr_sinport_(&(server), __FILE__, \
|
||||
__LINE__)) /* port lvalue (network order) */
|
||||
|
||||
#define SOCaddr_size(server) \
|
||||
(SOCaddr_size_(&(server), __FILE__, __LINE__)) /* active sockaddr length */
|
||||
|
||||
#define SOCaddr_is_valid(server) \
|
||||
(SOCaddr_size_(&(server), __FILE__, __LINE__) != \
|
||||
0) /* nonzero if family is set */
|
||||
|
||||
#define SOCaddr_clear(server) SOCaddr_clear_(&(server), __FILE__, __LINE__)
|
||||
|
||||
#define SOCaddr_sockaddr(server) \
|
||||
((server).m_addr.sa) /* generic struct sockaddr view */
|
||||
|
||||
#define SOCaddr_capacity(server) \
|
||||
sizeof((server).m_addr) /* full union size, for recvfrom() etc. */
|
||||
|
||||
/** Address family to bind/listen with: AF_INET6 when IPv6 is enabled (dual
|
||||
stack), else AF_INET. */
|
||||
#if HTS_INET6 != 0
|
||||
#define AFinet AF_INET6
|
||||
#else
|
||||
#define AFinet AF_INET
|
||||
#endif
|
||||
|
||||
/* Set port to sockaddr structure */
|
||||
/** Set the port (host-order argument, stored network-order) on the active
|
||||
* family. */
|
||||
#define SOCaddr_initport(server, port) do { \
|
||||
SOCaddr_sinport(server) = htons((in_port_t) (port)); \
|
||||
} while(0)
|
||||
|
||||
/** Initialize as an all-zero IPv4 wildcard (INADDR_ANY) address; returns its
|
||||
sockaddr length. */
|
||||
static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr*const addr,
|
||||
const char *file, const int line) {
|
||||
assertf_(addr != NULL, file, line);
|
||||
@@ -175,13 +205,15 @@ static HTS_INLINE HTS_UNUSED socklen_t SOCaddr_initany_(SOCaddr*const addr,
|
||||
return SOCaddr_size_(addr, file, line);
|
||||
}
|
||||
|
||||
/** Initialize server as an IPv4 wildcard (INADDR_ANY) address. */
|
||||
#define SOCaddr_initany(server) do { \
|
||||
SOCaddr_initany_(&(server), __FILE__, __LINE__); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
Copy sockaddr_in/sockaddr_in6/raw IPv4/raw IPv6 to our opaque SOCaddr
|
||||
*/
|
||||
/** Populate server from data. data_size selects the source form: a full
|
||||
sockaddr_in / sockaddr_in6, or a raw 4-byte (IPv4) / 16-byte (IPv6) address
|
||||
with port zeroed. Any other size leaves an AF_INET shell. Returns the
|
||||
resulting sockaddr length. */
|
||||
static HTS_UNUSED socklen_t SOCaddr_copyaddr_(SOCaddr*const server,
|
||||
const void *data, const size_t data_size,
|
||||
const char *file, const int line) {
|
||||
@@ -214,20 +246,24 @@ static HTS_UNUSED socklen_t SOCaddr_copyaddr_(SOCaddr*const server,
|
||||
return SOCaddr_size_(server, file, line);
|
||||
}
|
||||
|
||||
/** Copy hpaddr (length hpsize) into server, writing the result length into the
|
||||
lvalue server_len (int). See SOCaddr_copyaddr_ for accepted forms. */
|
||||
#define SOCaddr_copyaddr(server, server_len, hpaddr, hpsize) do { \
|
||||
server_len = (int) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, __LINE__); \
|
||||
} while(0)
|
||||
|
||||
/** Like SOCaddr_copyaddr but discards the result length. */
|
||||
#define SOCaddr_copyaddr2(server, hpaddr, hpsize) do { \
|
||||
(void) SOCaddr_copyaddr_(&(server), hpaddr, hpsize, __FILE__, __LINE__); \
|
||||
} while(0)
|
||||
|
||||
/** Copy one SOCaddr (src) into another (dest), preserving family and port. */
|
||||
#define SOCaddr_copy_SOCaddr(dest, src) do { \
|
||||
SOCaddr_copyaddr_(&(dest), &(src).m_addr.sa, SOCaddr_size(src), __FILE__, __LINE__); \
|
||||
} while(0)
|
||||
|
||||
/* Get dotted address */
|
||||
|
||||
/** Write the numeric (dotted/colon) host of ss into namebuf (capacity
|
||||
namebuflen), scope id stripped. On failure namebuf becomes "". */
|
||||
static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
||||
SOCaddr *const ss,
|
||||
const char *file, const int line) {
|
||||
@@ -248,13 +284,14 @@ static HTS_UNUSED void SOCaddr_inetntoa_(char *namebuf, size_t namebuflen,
|
||||
}
|
||||
}
|
||||
|
||||
/** Numeric host of ss into namebuf (capacity namebuflen); "" on failure. */
|
||||
#define SOCaddr_inetntoa(namebuf, namebuflen, ss) \
|
||||
SOCaddr_inetntoa_(namebuf, namebuflen, &(ss), __FILE__, __LINE__)
|
||||
|
||||
/* Get protocol ID */
|
||||
/** Single-char family tag: '1' for IPv4, '2' otherwise (used in the cache). */
|
||||
#define SOCaddr_getproto(ss) ( SOCaddr_size(ss) == sizeof(struct sockaddr_in) ? '1' : '2')
|
||||
|
||||
/* Socket length type */
|
||||
/** Length type for socket APIs (getsockname, accept, ...). */
|
||||
typedef socklen_t SOClen;
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
533
src/htsopt.h
533
src/htsopt.h
@@ -81,38 +81,41 @@ struct String {
|
||||
|
||||
/* Defines */
|
||||
#define CATBUFF_SIZE (STRING_SIZE*2*2)
|
||||
|
||||
#define STRING_SIZE 2048
|
||||
|
||||
/* Proxy structure */
|
||||
/* Proxy configuration. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_t_proxy
|
||||
#define HTS_DEF_FWSTRUCT_t_proxy
|
||||
typedef struct t_proxy t_proxy;
|
||||
#endif
|
||||
struct t_proxy {
|
||||
int active;
|
||||
String name;
|
||||
int port;
|
||||
String bindhost; // bind this host
|
||||
int active; /**< nonzero if a proxy is configured */
|
||||
String name; /**< proxy host name */
|
||||
int port; /**< proxy port */
|
||||
String bindhost; /**< local address to bind the outgoing socket to */
|
||||
};
|
||||
|
||||
/* Structure utile pour copier en bloc les paramètres */
|
||||
/* Bundle of filter pointers, kept together for bulk copy. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsfilters
|
||||
#define HTS_DEF_FWSTRUCT_htsfilters
|
||||
typedef struct htsfilters htsfilters;
|
||||
#endif
|
||||
struct htsfilters {
|
||||
char ***filters;
|
||||
int *filptr;
|
||||
//int* filter_max;
|
||||
char ***filters; /**< pointer to the +/-pattern filter array */
|
||||
int *filptr; /**< pointer to the current filter count */
|
||||
// int* filter_max;
|
||||
};
|
||||
|
||||
/* User callbacks chain */
|
||||
typedef int (*htscallbacksfncptr) (void);
|
||||
|
||||
typedef struct htscallbacks htscallbacks;
|
||||
|
||||
struct htscallbacks {
|
||||
void *moduleHandle;
|
||||
htscallbacksfncptr exitFnc;
|
||||
htscallbacks *next;
|
||||
void *moduleHandle; /**< handle of the module that registered the callback */
|
||||
htscallbacksfncptr exitFnc; /**< function to run on engine exit */
|
||||
htscallbacks *next; /**< next entry in the callback chain */
|
||||
};
|
||||
|
||||
/* filenote() internal file structure */
|
||||
@@ -188,14 +191,14 @@ typedef enum hts_log_type {
|
||||
} hts_log_type;
|
||||
#endif
|
||||
|
||||
/* Structure état du miroir */
|
||||
/* Mirror cancellation list node. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsoptstatecancel
|
||||
#define HTS_DEF_FWSTRUCT_htsoptstatecancel
|
||||
typedef struct htsoptstatecancel htsoptstatecancel;
|
||||
#endif
|
||||
struct htsoptstatecancel {
|
||||
char *url;
|
||||
htsoptstatecancel *next;
|
||||
char *url; /**< URL flagged to be cancelled */
|
||||
htsoptstatecancel *next; /**< next cancellation entry */
|
||||
};
|
||||
|
||||
/* Mutexes */
|
||||
@@ -210,48 +213,48 @@ typedef struct htsmutex_s htsmutex_s, *htsmutex;
|
||||
typedef struct struct_coucal struct_coucal, *coucal;
|
||||
#endif
|
||||
|
||||
/* Structure état du miroir */
|
||||
/* Mirror runtime state (mutable engine state, not user options). */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsoptstate
|
||||
#define HTS_DEF_FWSTRUCT_htsoptstate
|
||||
typedef struct htsoptstate htsoptstate;
|
||||
#endif
|
||||
struct htsoptstate {
|
||||
htsmutex lock; /* 3.41 */
|
||||
htsmutex lock; /**< guards this state block */
|
||||
/* */
|
||||
int stop;
|
||||
int stop; /**< set to request the mirror to stop */
|
||||
int exit_xh;
|
||||
int back_add_stats;
|
||||
/* */
|
||||
int mimehtml_created;
|
||||
String mimemid;
|
||||
FILE *mimefp;
|
||||
int delayedId;
|
||||
int mimehtml_created; /**< MIME/MHTML output already started */
|
||||
String mimemid; /**< MIME multipart boundary id */
|
||||
FILE *mimefp; /**< MIME/MHTML output file */
|
||||
int delayedId; /**< counter for delayed-type-check ids */
|
||||
/* */
|
||||
filenote_strc strc;
|
||||
/* Functions context (avoir thread variables!) */
|
||||
htscallbacks callbacks;
|
||||
concat_strc concat;
|
||||
usercommand_strc usercmd;
|
||||
fspc_strc fspc;
|
||||
filenote_strc strc; /**< filenote() listing state */
|
||||
/* Per-call function contexts (thread-local scratch, avoids globals) */
|
||||
htscallbacks callbacks; /**< user callback chain head */
|
||||
concat_strc concat; /**< concat() rotating buffers */
|
||||
usercommand_strc usercmd; /**< pending user shell command */
|
||||
fspc_strc fspc; /**< error/warning/info counters */
|
||||
char *userhttptype;
|
||||
int verif_backblue_done;
|
||||
int verif_backblue_done; /**< backblue.gif/fade.gif already emitted */
|
||||
int verif_external_status;
|
||||
t_dnscache *dns_cache;
|
||||
int dns_cache_nthreads;
|
||||
t_dnscache *dns_cache; /**< DNS resolution cache */
|
||||
int dns_cache_nthreads; /**< number of in-flight DNS resolver threads */
|
||||
/* HTML parsing state */
|
||||
char _hts_errmsg[HTS_CDLMAXSIZE + 256];
|
||||
char _hts_errmsg[HTS_CDLMAXSIZE + 256]; /**< last engine error message */
|
||||
int _hts_in_html_parsing;
|
||||
int _hts_in_html_done;
|
||||
int _hts_in_html_poll;
|
||||
int _hts_setpause;
|
||||
int _hts_in_mirror;
|
||||
char **_hts_addurl;
|
||||
int _hts_in_mirror; /**< nonzero while a mirror is running */
|
||||
char **_hts_addurl; /**< extra URLs to inject at runtime */
|
||||
int _hts_cancel;
|
||||
htsoptstatecancel *cancel; /* 3.41 */
|
||||
htsoptstatecancel *cancel; /**< list of URLs flagged for cancellation */
|
||||
char HTbuff[2048];
|
||||
unsigned int debug_state;
|
||||
unsigned int tmpnameid; /* 3.41 */
|
||||
int is_ended; /* 3.48-14 */
|
||||
unsigned int tmpnameid; /**< counter for temporary file names */
|
||||
int is_ended; /**< mirror has finished */
|
||||
};
|
||||
|
||||
/* Library handles */
|
||||
@@ -264,12 +267,13 @@ typedef struct htslibhandles htslibhandles;
|
||||
typedef struct htslibhandle htslibhandle;
|
||||
#endif
|
||||
struct htslibhandle {
|
||||
char *moduleName;
|
||||
void *handle;
|
||||
char *moduleName; /**< name of a loaded external module */
|
||||
void *handle; /**< dlopen() handle for it */
|
||||
};
|
||||
|
||||
struct htslibhandles {
|
||||
int count;
|
||||
htslibhandle *handles;
|
||||
int count; /**< number of loaded module handles */
|
||||
htslibhandle *handles; /**< array of loaded module handles */
|
||||
};
|
||||
|
||||
/* Javascript parser flags */
|
||||
@@ -286,176 +290,192 @@ typedef enum htsparsejava_flags {
|
||||
typedef struct lien_buffers lien_buffers;
|
||||
#endif
|
||||
|
||||
// paramètres httrack (options)
|
||||
/*
|
||||
* Per-mirror options and state block. This is the central HTTrack parameters
|
||||
* structure: created by hts_create_opt(), it carries every tunable option for
|
||||
* one mirror and embeds the live engine state, and is then consumed by
|
||||
* hts_main2().
|
||||
*
|
||||
* Callers normally configure it through the command-line argv vector (the
|
||||
* option parser), not by writing fields directly. The only fields real
|
||||
* consumers poke directly are 'log' and 'errlog' (set either to NULL to
|
||||
* silence logging).
|
||||
*/
|
||||
#ifndef HTS_DEF_FWSTRUCT_httrackp
|
||||
#define HTS_DEF_FWSTRUCT_httrackp
|
||||
typedef struct httrackp httrackp;
|
||||
#endif
|
||||
struct httrackp {
|
||||
size_t size_httrackp; // size of this structure
|
||||
size_t size_httrackp; /**< size of this structure (version/ABI guard) */
|
||||
/* */
|
||||
int wizard; // wizard aucun/grand/petit
|
||||
int flush; // fflush sur les fichiers log
|
||||
int travel; // type de déplacements (same domain etc)
|
||||
int seeker; // up & down
|
||||
int depth; // nombre de niveaux de récursion
|
||||
int extdepth; // nombre de niveaux de récursion à l'éxtérieur
|
||||
int urlmode; // liens relatifs etc
|
||||
int wizard; /**< interactive wizard level (none/full/light) */
|
||||
int flush; /**< fflush() log files after each write */
|
||||
int travel; /**< link-following scope (same domain, etc.) */
|
||||
int seeker; /**< allowed direction: go up and/or down the tree */
|
||||
int depth; /**< maximum recursion depth (-rN) */
|
||||
int extdepth; /**< maximum recursion depth outside the start domain */
|
||||
int urlmode; /**< saved-link rewriting style (relative, absolute, etc.) */
|
||||
int no_type_change; // do not change file type according to MIME
|
||||
int debug; // mode débug log
|
||||
int getmode; // sauver html, images..
|
||||
FILE *log; // fichier log
|
||||
FILE *errlog; // et erreur
|
||||
LLint maxsite; // taille max site
|
||||
LLint maxfile_nonhtml; // taille max non html
|
||||
LLint maxfile_html; // taille max html
|
||||
int maxsoc; // nbre sockets
|
||||
LLint fragment; // fragmentation d'un site
|
||||
int nearlink; // prendre les images/data proche d'une page mais à l'extérieur
|
||||
int makeindex; // faire un index
|
||||
int kindex; // et un index 'keyword'
|
||||
int delete_old; // effacer anciens fichiers
|
||||
int timeout; // nombre de secondes de timeout
|
||||
int rateout; // nombre d'octets minium pour le transfert
|
||||
int maxtime; // temps max en secondes
|
||||
int maxrate; // taux de transfert max
|
||||
float maxconn; // nombre max de connexions/s
|
||||
int waittime; // démarrage programmé
|
||||
int cache; // génération d'un cache
|
||||
//int aff_progress; // barre de progression
|
||||
int shell; // gestion d'un shell par pipe stdin/stdout
|
||||
t_proxy proxy; // configuration du proxy
|
||||
int savename_83; // conversion 8-3 pour les noms de fichiers
|
||||
int savename_type; // type de noms: structure originale/html-images en un seul niveau
|
||||
String savename_userdef; // structure userdef (ex: %h%p/%n%q.%t)
|
||||
int debug; /**< debug logging level */
|
||||
int getmode; /**< what to fetch (HTML, images, ...) bitmask */
|
||||
FILE *log; /**< informational log stream; NULL mutes it */
|
||||
FILE *errlog; /**< error log stream; NULL mutes it */
|
||||
LLint maxsite; /**< max total bytes for the whole mirror */
|
||||
LLint maxfile_nonhtml; /**< max bytes per non-HTML file */
|
||||
LLint maxfile_html; /**< max bytes per HTML file */
|
||||
int maxsoc; /**< max simultaneous sockets (-cN) */
|
||||
LLint fragment; /**< split site after this many bytes */
|
||||
int nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||
int makeindex; /**< build a top-level index.html */
|
||||
int kindex; /**< build a keyword index */
|
||||
int delete_old; /**< delete locally obsolete files after update */
|
||||
int timeout; /**< connection timeout in seconds */
|
||||
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
||||
int maxtime; /**< max total mirror duration in seconds */
|
||||
int maxrate; /**< max transfer rate cap (bytes/s) */
|
||||
float maxconn; /**< max connections per second */
|
||||
int waittime; /**< scheduled start time (wall-clock seconds) */
|
||||
int cache; /**< cache generation mode */
|
||||
// int aff_progress; // progress bar
|
||||
int shell; /**< driven by a shell over stdin/stdout pipes */
|
||||
t_proxy proxy; /**< proxy configuration */
|
||||
int savename_83; /**< force 8.3 (DOS) file names */
|
||||
int savename_type; /**< saved-name layout (original tree, flat, ...) */
|
||||
String
|
||||
savename_userdef; /**< user-defined name template (e.g. %h%p/%n%q.%t) */
|
||||
int savename_delayed; // delayed type check
|
||||
int delayed_cached; // delayed type check can be cached to speedup updates
|
||||
int mimehtml; // MIME-html
|
||||
int user_agent_send; // user agent (ex: httrack/1.0 [sun])
|
||||
String user_agent; //
|
||||
String referer; // referer
|
||||
String from; // from
|
||||
String path_log; // chemin pour cache et log
|
||||
String path_html; // chemin pour miroir
|
||||
String path_html_utf8; // chemin pour miroir, UTF-8
|
||||
String path_bin; // chemin pour templates
|
||||
int retry; // nombre d'essais supplémentaires en cas d'échec
|
||||
int makestat; // mettre à jour un fichier log de statistiques de transfert
|
||||
int maketrack; // mettre à jour un fichier log de statistiques d'opérations
|
||||
int parsejava; // parsing des classes java pour récupérer les class, gif & cie ; see htsparsejava_flags
|
||||
int hostcontrol; // abandon d'un host trop lent etc.
|
||||
int errpage; // générer une page d'erreur en cas de 404 etc.
|
||||
int check_type; // si type inconnu (cgi,asp,/) alors tester lien (et gérer moved éventuellement)
|
||||
int all_in_cache; // tout mettre en cache!
|
||||
int robots; // traitement des robots
|
||||
int external; // pages externes->pages d'erreur
|
||||
int passprivacy; // pas de mot de pass dans les liens externes?
|
||||
int includequery; // include la query-string
|
||||
int mirror_first_page; // miroir des liens
|
||||
String sys_com; // commande système
|
||||
int sys_com_exec; // executer commande
|
||||
int accept_cookie; // gestion des cookies
|
||||
t_cookie *cookie;
|
||||
int http10; // forcer http 1.0
|
||||
int nokeepalive; // pas de keep-alive
|
||||
int nocompression; // pas de compression
|
||||
int sizehack; // forcer réponse "mis à jour" si taille identique
|
||||
int mimehtml; /**< produce a single MIME/MHTML archive */
|
||||
int user_agent_send; /**< send a User-Agent header */
|
||||
String user_agent; /**< User-Agent value (e.g. httrack/1.0) */
|
||||
String referer; /**< Referer value to send */
|
||||
String from; /**< From value to send */
|
||||
String path_log; /**< directory for cache and logs */
|
||||
String path_html; /**< output directory for the mirror */
|
||||
String path_html_utf8; /**< output directory for the mirror, UTF-8 form */
|
||||
String path_bin; /**< directory for HTML templates */
|
||||
int retry; /**< extra retries on a failed transfer */
|
||||
int makestat; /**< maintain a transfer-statistics log */
|
||||
int maketrack; /**< maintain an operations-statistics log */
|
||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||
int hostcontrol; /**< drop hosts that are too slow, etc. */
|
||||
int errpage; /**< generate an error page on 404 and similar */
|
||||
int check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||
*/
|
||||
int all_in_cache; /**< keep all retrieved data in the cache */
|
||||
int robots; /**< robots.txt handling level */
|
||||
int external; /**< render external links as error pages */
|
||||
int passprivacy; /**< strip passwords from external links */
|
||||
int includequery; /**< include the query string in saved names */
|
||||
int mirror_first_page; /**< only mirror the links of the first page */
|
||||
String sys_com; /**< system command to run */
|
||||
int sys_com_exec; /**< actually execute sys_com */
|
||||
int accept_cookie; /**< accept and send cookies */
|
||||
t_cookie *cookie; /**< cookie store */
|
||||
int http10; /**< force HTTP/1.0 */
|
||||
int nokeepalive; /**< disable keep-alive */
|
||||
int nocompression; /**< disable content compression */
|
||||
int sizehack; /**< treat same-size response as "updated" */
|
||||
int urlhack; // force "url normalization" to avoid loops
|
||||
int tolerant; // accepter content-length incorrect
|
||||
int parseall; // essayer de tout parser (tags inconnus contenant des liens, par exemple)
|
||||
int parsedebug; // débugger parser (debug!)
|
||||
int norecatch; // ne pas reprendre les fichiers effacés localement par l'utilisateur
|
||||
int verbosedisplay; // animation textuelle
|
||||
String footer; // ligne d'infos
|
||||
int maxcache; // maximum en mémoire au niveau du cache (backing)
|
||||
//int maxcache_anticipate; // maximum de liens à anticiper (majorant)
|
||||
int ftp_proxy; // proxy http pour ftp
|
||||
String filelist; // fichier liste URL à inclure
|
||||
String urllist; // fichier liste de filtres à inclure
|
||||
htsfilters filters; // contient les pointeurs pour les filtres
|
||||
int tolerant; /**< accept an incorrect Content-Length */
|
||||
int parseall; /**< parse aggressively, including unknown tags with links */
|
||||
int parsedebug; /**< parser debug mode */
|
||||
int norecatch; /**< do not re-fetch files the user deleted locally */
|
||||
int verbosedisplay; /**< animated text progress display */
|
||||
String footer; /**< footer/info line injected into pages */
|
||||
int maxcache; /**< in-memory cache backing limit (bytes) */
|
||||
// int maxcache_anticipate; // maximum links to anticipate (upper bound)
|
||||
int ftp_proxy; /**< use the HTTP proxy for FTP too */
|
||||
String filelist; /**< file listing URLs to include */
|
||||
String urllist; /**< file listing filters to include */
|
||||
htsfilters filters; /**< filter pointers (+/-pattern rules) */
|
||||
hash_struct *hash; // hash structure
|
||||
lien_url **liens; // links
|
||||
int lien_tot; // top index of "links" heap (always out-of-range)
|
||||
lien_buffers *liensbuf; // links buffers
|
||||
robots_wizard *robotsptr; // robots ptr
|
||||
String lang_iso; // en, fr ..
|
||||
String lang_iso; /**< Accept-Language value (en, fr, ...) */
|
||||
String accept; // Accept:
|
||||
String headers; // Additional headers
|
||||
String mimedefs; // ext1=mimetype1\next2=mimetype2..
|
||||
String mod_blacklist; // (3.41)
|
||||
String mod_blacklist; /**< blacklisted modules */
|
||||
int convert_utf8; // filenames UTF-8 conversion (3.46)
|
||||
//
|
||||
int maxlink; // nombre max de liens
|
||||
int maxfilter; // nombre max de filtres
|
||||
int maxlink; /**< max number of links */
|
||||
int maxfilter; /**< max number of filters */
|
||||
//
|
||||
const char *exec; // adresse du nom de l'éxecutable
|
||||
const char *exec; /**< path of the running executable */
|
||||
//
|
||||
int quiet; // poser des questions autres que wizard?
|
||||
int keyboard; // vérifier stdin
|
||||
int quiet; /**< suppress non-wizard questions */
|
||||
int keyboard; /**< poll stdin for keyboard input */
|
||||
int bypass_limits; // bypass built-in limits
|
||||
int background_on_suspend; // background process on suspend signal
|
||||
//
|
||||
int is_update; // c'est une update (afficher "File updated...")
|
||||
int dir_topindex; // reconstruire top index par la suite
|
||||
int is_update; /**< this run is an update (show "File updated...") */
|
||||
int dir_topindex; /**< rebuild the top index afterwards */
|
||||
//
|
||||
// callbacks
|
||||
t_hts_htmlcheck_callbacks *callbacks_fun;
|
||||
t_hts_htmlcheck_callbacks
|
||||
*callbacks_fun; /**< user HTML/parsing callback table */
|
||||
// store library handles
|
||||
htslibhandles libHandles;
|
||||
htslibhandles libHandles; /**< loaded external module handles */
|
||||
//
|
||||
htsoptstate state; // state
|
||||
htsoptstate state; /**< embedded live engine state */
|
||||
};
|
||||
|
||||
// stats for httrack
|
||||
/* Running statistics for a mirror. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_hts_stat_struct
|
||||
#define HTS_DEF_FWSTRUCT_hts_stat_struct
|
||||
typedef struct hts_stat_struct hts_stat_struct;
|
||||
#endif
|
||||
struct hts_stat_struct {
|
||||
LLint HTS_TOTAL_RECV; // flux entrant reçu
|
||||
LLint stat_bytes; // octets écrits sur disque
|
||||
// int HTS_TOTAL_RECV_STATE; // status: 0 tout va bien 1: ralentir un peu 2: ralentir 3: beaucoup
|
||||
TStamp stat_timestart; // départ
|
||||
LLint HTS_TOTAL_RECV; /**< total bytes received from the network */
|
||||
LLint stat_bytes; /**< total bytes written to disk */
|
||||
// int HTS_TOTAL_RECV_STATE; // status: 0 ok 1: slow down a little 2: slow
|
||||
// down 3: a lot
|
||||
TStamp stat_timestart; /**< mirror start time */
|
||||
//
|
||||
LLint total_packed; // flux entrant compressé reçu
|
||||
LLint total_unpacked; // flux entrant compressé reçu
|
||||
int total_packedfiles; // fichiers compressés
|
||||
LLint total_packed; /**< compressed bytes received (on the wire) */
|
||||
LLint total_unpacked; /**< bytes after decompression */
|
||||
int total_packedfiles; /**< number of compressed files */
|
||||
//
|
||||
TStamp istat_timestart[2]; // départ pour calcul instantanné
|
||||
LLint istat_bytes[2]; // calcul pour instantanné
|
||||
TStamp istat_reference01; // top départ donné par #0 à #1
|
||||
int istat_idlasttimer; // id du timer qui a récemment donné une stat
|
||||
TStamp
|
||||
istat_timestart[2]; /**< window start times for the instantaneous rate */
|
||||
LLint istat_bytes[2]; /**< window byte counts for the instantaneous rate */
|
||||
TStamp
|
||||
istat_reference01; /**< reference timestamp handed from window #0 to #1 */
|
||||
int istat_idlasttimer; /**< id of the timer that last produced a stat */
|
||||
//
|
||||
int stat_files; // nombre de fichiers écrits
|
||||
int stat_updated_files; // nombre de fichiers mis à jour
|
||||
int stat_background; // nombre de fichiers écrits en arrière plan
|
||||
int stat_files; /**< number of files written */
|
||||
int stat_updated_files; /**< number of files updated */
|
||||
int stat_background; /**< number of files written in the background */
|
||||
//
|
||||
int stat_nrequests; // nombre de requêtes sur socket
|
||||
int stat_sockid; // nombre de sockets allouées au total
|
||||
int stat_nsocket; // nombre de sockets
|
||||
int stat_errors; // nombre d'erreurs
|
||||
int stat_errors_front; // idem, mais au tout premier niveau
|
||||
int stat_warnings; // '' warnings
|
||||
int stat_infos; // '' infos
|
||||
int nbk; // fichiers anticipés en arrière plan et terminés
|
||||
LLint nb; // données transférées actuellement (estimation)
|
||||
int stat_nrequests; /**< number of requests issued on sockets */
|
||||
int stat_sockid; /**< total number of sockets ever allocated */
|
||||
int stat_nsocket; /**< current number of open sockets */
|
||||
int stat_errors; /**< number of errors */
|
||||
int stat_errors_front; /**< errors at the very first level */
|
||||
int stat_warnings; /**< number of warnings */
|
||||
int stat_infos; /**< number of info messages */
|
||||
int nbk; /**< background-anticipated files now completed */
|
||||
LLint nb; /**< bytes currently being transferred (estimate) */
|
||||
//
|
||||
LLint rate;
|
||||
LLint rate; /**< current transfer rate */
|
||||
//
|
||||
TStamp last_connect; // last connect() call
|
||||
TStamp last_request; // last request issued
|
||||
TStamp last_connect; /**< time of the last connect() call */
|
||||
TStamp last_request; /**< time of the last request issued */
|
||||
};
|
||||
|
||||
// structure pour paramètres supplémentaires lors de la requête
|
||||
/* Extra per-request parameters (mirrors httrackp request options). */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsrequest_proxy
|
||||
#define HTS_DEF_FWSTRUCT_htsrequest_proxy
|
||||
typedef struct htsrequest_proxy htsrequest_proxy;
|
||||
#endif
|
||||
/** Proxy settings attached to a single request. */
struct htsrequest_proxy {
  int active;           /**< nonzero if a proxy is used for this request */
  const char *name;     /**< proxy host name */
  int port;             /**< proxy port */
  const char *bindhost; /**< local address to bind the outgoing socket to */
};
|
||||
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsrequest
|
||||
@@ -463,93 +483,93 @@ struct htsrequest_proxy {
|
||||
typedef struct htsrequest htsrequest;
|
||||
#endif
|
||||
struct htsrequest {
|
||||
short int user_agent_send; // user agent (ex: httrack/1.0 [sun])
|
||||
short int http11; // l'en tête peut (doit) être signé HTTP/1.1 et non HTTP/1.0
|
||||
short int nokeepalive; // pas de keep-alive
|
||||
short int range_used; // Range utilisé
|
||||
short int nocompression; // Pas de compression
|
||||
short int user_agent_send; /**< send a User-Agent header */
|
||||
short int http11; /**< sign the request as HTTP/1.1 rather than HTTP/1.0 */
|
||||
short int nokeepalive; /**< disable keep-alive */
|
||||
short int range_used; /**< a Range header is in use */
|
||||
short int nocompression; /**< disable compression */
|
||||
short int flush_garbage; // recycled
|
||||
const char* user_agent;
|
||||
const char* referer;
|
||||
const char* from;
|
||||
const char* lang_iso;
|
||||
const char* accept;
|
||||
const char* headers;
|
||||
htsrequest_proxy proxy; // proxy
|
||||
const char *user_agent; /**< User-Agent value */
|
||||
const char *referer; /**< Referer value */
|
||||
const char *from; /**< From value */
|
||||
const char *lang_iso; /**< Accept-Language value */
|
||||
const char *accept; /**< Accept value */
|
||||
const char *headers; /**< extra request headers */
|
||||
htsrequest_proxy proxy; /**< proxy for this request */
|
||||
};
|
||||
|
||||
// structure pour retour d'une connexion/prise d'en tête
|
||||
/* Result of a connection / header fetch. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsblk
|
||||
#define HTS_DEF_FWSTRUCT_htsblk
|
||||
typedef struct htsblk htsblk;
|
||||
#endif
|
||||
struct htsblk {
|
||||
int statuscode; // status-code, -1=erreur, 200=OK,201=..etc (cf RFC1945)
|
||||
short int notmodified; // page ou fichier NON modifié (transféré)
|
||||
short int is_write; // sortie sur disque (out) ou en mémoire (adr)
|
||||
short int is_chunk; // mode chunk
|
||||
short int compressed; // compressé?
|
||||
short int empty; // vide?
|
||||
short int keep_alive; // Keep-Alive?
|
||||
short int keep_alive_trailers; // ..with trailers extension
|
||||
int keep_alive_t; // KA timeout
|
||||
int keep_alive_max; // KA number of requests
|
||||
char *adr; // adresse du bloc de mémoire, NULL=vide
|
||||
char *headers; // adresse des en têtes si présents
|
||||
FILE *out; // écriture directe sur disque (si is_write=1)
|
||||
LLint size; // taille fichier
|
||||
char msg[80]; // message éventuel si échec ("\0"=non précisé)
|
||||
int statuscode; /**< HTTP status code; -1=error, 200=OK, ... (RFC1945) */
|
||||
short int notmodified; /**< page/file was not modified (not transferred) */
|
||||
short int is_write; /**< output goes to disk (out) vs memory (adr) */
|
||||
short int is_chunk; /**< chunked transfer encoding */
|
||||
short int compressed; /**< body is compressed */
|
||||
short int empty; /**< body is empty */
|
||||
short int keep_alive; /**< connection is keep-alive */
|
||||
short int keep_alive_trailers; /**< keep-alive with trailers extension */
|
||||
int keep_alive_t; /**< keep-alive timeout (seconds) */
|
||||
int keep_alive_max; /**< keep-alive max number of requests */
|
||||
char *adr; /**< in-memory body buffer; NULL if empty */
|
||||
char *headers; /**< received headers, if any */
|
||||
FILE *out; /**< destination file when is_write=1 */
|
||||
LLint size; /**< body size */
|
||||
char msg[80]; /**< failure message ("" if none) */
|
||||
char contenttype[HTS_MIMETYPE_SIZE]; // content-type (e.g. "text/html")
|
||||
char charset[HTS_MIMETYPE_SIZE]; // charset (e.g. "iso-8859-1")
|
||||
char contentencoding[HTS_MIMETYPE_SIZE]; // content-encoding (e.g. "gzip")
|
||||
char *location; // on copie dedans éventuellement la véritable 'location'
|
||||
LLint totalsize; // taille totale à télécharger (-1=inconnue)
|
||||
short int is_file; // ce n'est pas une socket mais un descripteur de fichier si 1
|
||||
T_SOC soc; // ID socket
|
||||
SOCaddr address; // IP address
|
||||
char *location; /**< resolved Location target, if any */
|
||||
LLint totalsize; /**< total size to download (-1=unknown) */
|
||||
short int is_file; /**< 1 if a file descriptor rather than a socket */
|
||||
T_SOC soc; /**< socket id */
|
||||
SOCaddr address; /**< peer IP address */
|
||||
int address_size; // IP address structure length (unused internally)
|
||||
FILE *fp; // fichier pour file://
|
||||
FILE *fp; /**< file handle for file:// */
|
||||
#if HTS_USEOPENSSL
|
||||
short int ssl; // is this connection a SSL one? (https)
|
||||
short int ssl; /**< nonzero if this is an SSL connection (https) */
|
||||
// BIO* ssl_soc; // SSL structure
|
||||
SSL *ssl_con; // connection structure
|
||||
SSL *ssl_con; /**< SSL connection structure */
|
||||
#endif
|
||||
char lastmodified[64]; // Last-Modified
|
||||
char etag[256]; // Etag
|
||||
char cdispo[256]; // Content-Disposition coupé
|
||||
LLint crange; // Content-Range
|
||||
LLint crange_start; // Content-Range
|
||||
LLint crange_end; // Content-Range
|
||||
int debugid; // debug connection
|
||||
char lastmodified[64]; /**< Last-Modified value */
|
||||
char etag[256]; /**< ETag value */
|
||||
char cdispo[256]; /**< Content-Disposition filename (truncated) */
|
||||
LLint crange; /**< Content-Range length */
|
||||
LLint crange_start; /**< Content-Range start offset */
|
||||
LLint crange_end; /**< Content-Range end offset */
|
||||
int debugid; /**< connection debug id */
|
||||
/* */
|
||||
htsrequest req; // paramètres pour la requête
|
||||
/*char digest[32+2]; // digest md5 généré par le moteur ("" si non généré) */
|
||||
htsrequest req; /**< parameters used for the request */
|
||||
/*char digest[32+2]; // md5 digest generated by the engine ("" if none) */
|
||||
};
|
||||
|
||||
// structure d'un lien
|
||||
/* A single link in the crawl. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_url
|
||||
#define HTS_DEF_FWSTRUCT_lien_url
|
||||
typedef struct lien_url lien_url;
|
||||
#endif
|
||||
/** A single link in the crawl. */
struct lien_url {
  char *adr;        /**< host/address part of the URL */
  char *fil;        /**< remote file path */
  char *sav;        /**< local save name (with any path) */
  char *cod;        /**< codebase path for a Java class, if any */
  char *former_adr; /**< original address before a move; may be NULL */
  char *former_fil; /**< original remote file before a move; may be NULL */

  int premier;      /**< index of the first link that seeded this domain */
  int precedent;    /**< index of the link that referenced this one */
  int depth;        /**< remaining allowed depth; >0 strong, 0 weak */
  int pass2;        /**< second-pass marker; -1 means handled in background */
  char link_import; /**< imported after a move; skip the usual up/down rules */
  // int moved; // pointer to moved
  int retry;        /**< remaining retries */
  int testmode;     /**< test only: send just a HEAD */
};
|
||||
|
||||
// chargement de fichiers en 'arrière plan'
|
||||
/* A file being fetched in the background. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_lien_back
|
||||
#define HTS_DEF_FWSTRUCT_lien_back
|
||||
typedef struct lien_back lien_back;
|
||||
@@ -558,43 +578,44 @@ struct lien_back {
|
||||
#if DEBUG_CHECKINT
|
||||
char magic;
|
||||
#endif
|
||||
char url_adr[HTS_URLMAXSIZE * 2]; // adresse
|
||||
char url_fil[HTS_URLMAXSIZE * 2]; // nom du fichier distant
|
||||
char url_sav[HTS_URLMAXSIZE * 2]; // nom à sauver sur disque (avec chemin éventuel)
|
||||
char referer_adr[HTS_URLMAXSIZE * 2]; // adresse host page referer
|
||||
char referer_fil[HTS_URLMAXSIZE * 2]; // fichier page referer
|
||||
char location_buffer[HTS_URLMAXSIZE * 2]; // "location" en cas de "moved" (302,..)
|
||||
char *tmpfile; // nom à sauver temporairement (compressé)
|
||||
char tmpfile_buffer[HTS_URLMAXSIZE * 2]; // buffer pour le nom à sauver temporairement
|
||||
char send_too[1024]; // données à envoyer en même temps que le header
|
||||
int status; // status (-1=non utilisé, 0: prêt, >0: opération en cours)
|
||||
int locked; // locked (to be used soon)
|
||||
int testmode; // mode de test
|
||||
int timeout; // gérer des timeouts? (!=0 : nombre de secondes)
|
||||
TStamp timeout_refresh; // si oui, time refresh
|
||||
int rateout; // timeout refresh? (!=0 : taux minimum toléré en octets/s)
|
||||
TStamp rateout_time; // si oui, date de départ
|
||||
LLint maxfile_nonhtml; // taille max d'un fichier non html
|
||||
LLint maxfile_html; // idem pour un ficheir html
|
||||
htsblk r; // structure htsblk de chaque objet en background
|
||||
int is_update; // mode update
|
||||
int head_request; // requète HEAD?
|
||||
LLint range_req_size; // range utilisé
|
||||
TStamp ka_time_start; // refresh time for KA
|
||||
char url_adr[HTS_URLMAXSIZE * 2]; /**< host/address part of the URL */
|
||||
char url_fil[HTS_URLMAXSIZE * 2]; /**< remote file path */
|
||||
char url_sav[HTS_URLMAXSIZE * 2]; /**< local save name (with any path) */
|
||||
char referer_adr[HTS_URLMAXSIZE * 2]; /**< referer page host/address */
|
||||
char referer_fil[HTS_URLMAXSIZE * 2]; /**< referer page file */
|
||||
char
|
||||
location_buffer[HTS_URLMAXSIZE * 2]; /**< Location on a move (302, ...) */
|
||||
char *tmpfile; /**< temporary save name (compressed) */
|
||||
char tmpfile_buffer[HTS_URLMAXSIZE * 2]; /**< storage for tmpfile */
|
||||
char send_too[1024]; /**< data to send together with the header */
|
||||
int status; /**< -1=unused, 0=ready, >0=operation in progress */
|
||||
int locked; /**< locked (reserved) */
|
||||
int testmode; /**< test mode */
|
||||
int timeout; /**< timeout in seconds (0=none) */
|
||||
TStamp timeout_refresh; /**< last activity time, for timeout tracking */
|
||||
int rateout; /**< minimum tolerated rate in bytes/s (0=none) */
|
||||
TStamp rateout_time; /**< start time for the rate window */
|
||||
LLint maxfile_nonhtml; /**< max bytes for a non-HTML file */
|
||||
LLint maxfile_html; /**< max bytes for an HTML file */
|
||||
htsblk r; /**< per-object result block */
|
||||
int is_update; /**< update mode */
|
||||
int head_request; /**< this is a HEAD request */
|
||||
LLint range_req_size; /**< Range request size used */
|
||||
TStamp ka_time_start; /**< keep-alive refresh start time */
|
||||
//
|
||||
int http11; // L'en tête doit être signé HTTP/1.1 et non HTTP/1.0
|
||||
int is_chunk; // chunk?
|
||||
char *chunk_adr; // adresse chunk en cours de chargement
|
||||
LLint chunk_size; // taille chunk en cours de chargement
|
||||
LLint chunk_blocksize; // taille data declaree par le chunk
|
||||
LLint compressed_size; // taille compressés (stats uniquement)
|
||||
int http11; /**< sign the request as HTTP/1.1 rather than HTTP/1.0 */
|
||||
int is_chunk; /**< chunked transfer */
|
||||
char *chunk_adr; /**< buffer for the chunk being loaded */
|
||||
LLint chunk_size; /**< size of the chunk being loaded */
|
||||
LLint chunk_blocksize; /**< data size declared by the chunk */
|
||||
LLint compressed_size; /**< compressed size (stats only) */
|
||||
//
|
||||
//int links_index; // to access liens[links_index]
|
||||
//
|
||||
char info[256]; // éventuel status pour le ftp
|
||||
int stop_ftp; // flag stop pour ftp
|
||||
int finalized; // finalized (optim memory)
|
||||
int early_add; // was added before link heap saw it
|
||||
char info[256]; /**< status text, e.g. for FTP */
|
||||
int stop_ftp; /**< stop flag for FTP */
|
||||
int finalized; /**< finalized (memory optimization) */
|
||||
int early_add; /**< was added before the link heap saw it */
|
||||
#if DEBUG_CHECKINT
|
||||
char magic2;
|
||||
#endif
|
||||
|
||||
@@ -104,6 +104,7 @@ static HTS_UNUSED void abortf_(const char *exp, const char *file, int line) {
|
||||
* Check whether 'VAR' is of type char[].
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
/* Non-zero when VAR is a char array (of any length), zero otherwise (for
   example when VAR is a char pointer). VAR is not evaluated.
   Note: char[] and const char[] are compatible.
   Spelled __typeof__ rather than typeof so that this header also compiles in
   strict ISO modes (-std=c99, -std=c11), where the bare typeof keyword is not
   available. */
#define HTS_IS_CHAR_BUFFER(VAR) ( __builtin_types_compatible_p ( __typeof__ (VAR), char[] ) )
|
||||
#else
|
||||
@@ -139,8 +140,11 @@ static HTS_UNUSED void htssafe_compile_time_check_(void) {
|
||||
* (MSVC, ...) keep the previous behavior via the #else branches.
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
#if defined(__has_attribute)
|
||||
|
||||
#if __has_attribute(warning)
|
||||
|
||||
#define HTS_BUFF_PTR_ATTR(msg) __attribute__((unused, noinline, warning(msg)))
|
||||
#endif
|
||||
#endif
|
||||
@@ -152,29 +156,51 @@ static HTS_UNUSED void htssafe_compile_time_check_(void) {
|
||||
|
||||
HTS_BUFF_PTR_ATTR("strcpybuff() destination is a pointer (capacity unknown): "
|
||||
"NOT bounds-checked; use strlcpybuff(dst, src, size)")
|
||||
|
||||
static char *strcpybuff_ptr_(char *dest, const char *src) {
  /* Pointer-destination fallback for strcpybuff(): the capacity of dest is
     unknown here, so this is a plain, unbounded strcpy(). Returns dest. */
  return strcpy(dest, src);
}
|
||||
|
||||
HTS_BUFF_PTR_ATTR("strcatbuff() destination is a pointer (capacity unknown): "
|
||||
"NOT bounds-checked; use strlcatbuff(dst, src, size)")
|
||||
|
||||
static char *strcatbuff_ptr_(char *dest, const char *src) {
  /* Pointer-destination fallback for strcatbuff(): the capacity of dest is
     unknown here, so this is a plain, unbounded strcat(). Returns dest. */
  return strcat(dest, src);
}
|
||||
|
||||
HTS_BUFF_PTR_ATTR("strncatbuff() destination is a pointer (capacity unknown): "
|
||||
"NOT bounds-checked; use strlcatbuff(dst, src, size)")
|
||||
|
||||
static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
  /* Pointer-destination fallback for strncatbuff(): appends at most n bytes
     of src with strncat(); n limits the source only, the capacity of dest is
     unknown and is not checked. Returns dest. */
  return strncat(dest, src, n);
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* SIZE CONTRACT shared by strcpybuff/strcatbuff/strncatbuff (the "buff"
|
||||
* family): the destination bound is taken from sizeof(A), so A MUST be a real
|
||||
* char[] array in scope. The bound is the full array size in bytes, INCLUDING
|
||||
* the terminating NUL. On overflow the *_safe_ helpers do NOT truncate: they
|
||||
* abort() (assertf). On success the result is always NUL-terminated.
|
||||
*
|
||||
* CRITICAL CAVEAT: if A is a bare char* pointer (not an array), sizeof(A) is
|
||||
* the pointer size, not the buffer capacity. There is no way to recover the
|
||||
* real capacity, so these macros SILENTLY DEGRADE to the unbounded raw
|
||||
* strcpy()/strcat()/strncat() while still looking like a checked call. The
|
||||
* bound is lost. On GCC/Clang (C) the pointer case routes through the
|
||||
* *_ptr_ stubs above, which carry a 'warning' attribute to flag the site at
|
||||
* compile time; on other compilers it is silent. When the destination is a
|
||||
* pointer of known capacity, call the explicit-size strlcpybuff/strlcatbuff
|
||||
* (passing the capacity, NUL included) instead.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Append at most N characters from "B" to "A".
|
||||
* If "A" is a char[] variable whose size is not sizeof(char*), then the size
|
||||
* If "A" is a char[] variable whose size is not sizeof(char*), then the size
|
||||
* is assumed to be the capacity of this array.
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
#define strncatbuff(A, B, N) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
||||
strncat_safe_(A, sizeof(A), B, \
|
||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), N, \
|
||||
@@ -195,6 +221,7 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
||||
* is assumed to be the capacity of this array.
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
#define strcatbuff(A, B) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
||||
strncat_safe_(A, sizeof(A), B, \
|
||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), (size_t) -1, \
|
||||
@@ -215,6 +242,7 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
||||
* is assumed to be the capacity of this array.
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
#define strcpybuff(A, B) __builtin_choose_expr( HTS_IS_CHAR_BUFFER(A), \
|
||||
strcpy_safe_(A, sizeof(A), B, \
|
||||
HTS_IS_NOT_CHAR_BUFFER(B) ? (size_t) -1 : sizeof(B), \
|
||||
@@ -229,6 +257,14 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
||||
"overflow while copying '" #B "' to '"#A"'", __FILE__, __LINE__) )
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Explicit-size variants (strlcatbuff/strlncatbuff/strlcpybuff): the
|
||||
* destination capacity is the caller-supplied S (total bytes, NUL included),
|
||||
* NOT derived from sizeof(A). Use these when A is a pointer or its capacity is
|
||||
* not its sizeof. Same abort-on-overflow, always-NUL-terminated contract; no
|
||||
* silent pointer degradation since the bound is passed in.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Append characters of "B" to "A", "A" having a maximum capacity of "S".
|
||||
*/
|
||||
@@ -256,6 +292,7 @@ static char *strncatbuff_ptr_(char *dest, const char *src, size_t n) {
|
||||
|
||||
/** strnlen replacement (autotools). **/
|
||||
#if ( ! defined(_WIN32) && ! defined(HAVE_STRNLEN) )
|
||||
|
||||
static HTS_UNUSED size_t strnlen(const char *s, size_t maxlen) {
|
||||
size_t i;
|
||||
for(i = 0 ; i < maxlen && s[i] != '\0' ; i++) ;
|
||||
@@ -263,6 +300,10 @@ static HTS_UNUSED size_t strnlen(const char *s, size_t maxlen) {
|
||||
}
|
||||
#endif
|
||||
|
||||
/* strlen of source, but bounded by sizeof_source (its capacity, NUL included).
|
||||
Aborts if source is NULL or has no NUL within that capacity. The sentinel
|
||||
sizeof_source == (size_t)-1 means "capacity unknown", and falls back to the
|
||||
unbounded strlen (used when the source is a pointer rather than an array). */
|
||||
static HTS_INLINE HTS_UNUSED size_t strlen_safe_(const char *source, const size_t sizeof_source,
|
||||
const char *file, int line) {
|
||||
size_t size;
|
||||
@@ -273,6 +314,11 @@ static HTS_INLINE HTS_UNUSED size_t strlen_safe_(const char *source, const size_
|
||||
return size;
|
||||
}
|
||||
|
||||
/* Core bounded append. Appends min(strlen(source), n) bytes of source onto
|
||||
dest. sizeof_dest is dest's total capacity (NUL included); sizeof_source is
|
||||
source's capacity or (size_t)-1 if unknown. Aborts if the result (existing
|
||||
dest length + appended bytes + NUL) would not fit sizeof_dest: this NEVER
|
||||
truncates. Always NUL-terminates on success. */
|
||||
static HTS_INLINE HTS_UNUSED char* strncat_safe_(char *const dest, const size_t sizeof_dest,
|
||||
const char *const source, const size_t sizeof_source,
|
||||
const size_t n,
|
||||
@@ -288,6 +334,9 @@ static HTS_INLINE HTS_UNUSED char* strncat_safe_(char *const dest, const size_t
|
||||
return dest;
|
||||
}
|
||||
|
||||
/* Core bounded copy: empties dest then appends all of source via
|
||||
strncat_safe_. sizeof_dest is dest's total capacity (NUL included). Aborts
|
||||
(no truncation) if source plus its NUL would not fit. */
|
||||
static HTS_INLINE HTS_UNUSED char* strcpy_safe_(char *const dest, const size_t sizeof_dest,
|
||||
const char *const source, const size_t sizeof_source,
|
||||
const char *exp, const char *file, int line) {
|
||||
@@ -333,9 +382,11 @@ static HTS_INLINE HTS_UNUSED htsbuff htsbuff_ptr_(char *buf, size_t cap) {
|
||||
* On other compilers there is no such guard, so pass only true arrays there.
|
||||
*/
|
||||
#if (defined(__GNUC__) && !defined(__cplusplus))
|
||||
|
||||
/* Evaluates to 0 when A is an array. When A is a pointer, A and &(A)[0] have
   the same type, the array size below becomes 1 - 2 = -1, and compilation
   fails. A is not evaluated. Spelled __typeof__ rather than typeof so the
   header also compiles in strict ISO modes (-std=c99, -std=c11). */
#define htsbuff_must_be_array_(A) \
  (sizeof(char[1 - 2 * !!__builtin_types_compatible_p(__typeof__(A), __typeof__(&(A)[0]))]) - 1)

/* Build an htsbuff over the array ARR, using sizeof(ARR) as its capacity.
   Passing a pointer is rejected at compile time by htsbuff_must_be_array_. */
#define htsbuff_array(ARR) htsbuff_ptr_((ARR), sizeof(ARR) + htsbuff_must_be_array_(ARR))
|
||||
#else
|
||||
#define htsbuff_array(ARR) htsbuff_ptr_((ARR), sizeof(ARR))
|
||||
@@ -378,11 +429,20 @@ static HTS_INLINE HTS_UNUSED const char *htsbuff_str(const htsbuff *b) {
|
||||
return b->buf;
|
||||
}
|
||||
|
||||
/* Thin aliases over the libc allocator/memcpy (historical "t" suffix); no
|
||||
added bounds checking. freet() also NULLs the freed pointer and tolerates
|
||||
NULL. memcpybuff() despite the name is a raw memcpy: the caller owns the
|
||||
bounds. */
|
||||
/* malloc() alias. */
#define malloct(A) malloc(A)

/* calloc() alias. */
#define calloct(A,B) calloc((A), (B))

/* free() that accepts NULL and resets the freed lvalue to NULL afterwards.
   A is expanded several times, so it must be a side-effect-free lvalue. */
#define freet(A) do { if ((A) != NULL) { free(A); (A) = NULL; } } while(0)

/* strdup() alias. */
#define strdupt(A) strdup(A)

/* realloc() alias; the result must be checked by the caller as with realloc(). */
#define realloct(A,B) realloc((A), (B))

/* Raw memcpy(): no bounds checking, the caller guarantees N bytes fit. */
#define memcpybuff(A, B, N) memcpy((A), (B), (N))
|
||||
|
||||
#endif
|
||||
|
||||
112
src/htsstrings.h
112
src/htsstrings.h
@@ -42,7 +42,9 @@ Please visit our Website: http://www.httrack.com
|
||||
#ifndef HTS_UNUSED
|
||||
#ifdef __GNUC__
|
||||
/* Marks a declaration as possibly unused (silences unused warnings). */
#define HTS_UNUSED __attribute__ ((unused))

/* Storage class for header-defined helpers: static, and allowed to be unused. */
#define HTS_STATIC static __attribute__ ((unused))

/* printf-style format checking: fmt is the 1-based index of the format string
   parameter, arg the index of the first variadic argument. */
#define HTS_PRINTF_FUN(fmt, arg) __attribute__ ((format (printf, fmt, arg)))
|
||||
#else
|
||||
#define HTS_UNUSED
|
||||
@@ -58,6 +60,23 @@ typedef struct String String;
|
||||
#endif
|
||||
#ifndef HTS_DEF_STRUCT_String
|
||||
#define HTS_DEF_STRUCT_String
|
||||
/**
|
||||
* Growable owned string.
|
||||
*
|
||||
* Ownership/lifetime: the String owns buffer_ and frees it (StringFree).
|
||||
* buffer_ is allocated lazily, so a freshly STRING_EMPTY/StringInit'd String,
|
||||
* or one just StringFree'd/StringAcquire'd, has buffer_ == NULL and
|
||||
* length_ == capacity_ == 0. Any growing operation may realloc, so a pointer
|
||||
* obtained from StringBuff/StringBuffRW is invalidated by the next append,
|
||||
* copy, or room request; do not cache it across such calls.
|
||||
*
|
||||
* Invariants when buffer_ != NULL: length_ < capacity_, and buffer_[length_]
|
||||
* is a NUL (the content is always NUL-terminated). length_ excludes that NUL;
|
||||
* capacity_ counts it. The empty state (buffer_ == NULL) has no readable NUL,
|
||||
* so callers must not treat StringBuff() of an untouched String as "".
|
||||
*
|
||||
* Direct field access is internal (trailing underscore); use the macros below.
|
||||
*/
|
||||
struct String {
|
||||
char *buffer_;
|
||||
size_t length_;
|
||||
@@ -68,6 +87,7 @@ struct String {
|
||||
/** Allocator **/
|
||||
#ifndef STRING_REALLOC
|
||||
#define STRING_REALLOC(BUFF, SIZE) ( (char*) realloc(BUFF, SIZE) )
|
||||
|
||||
#define STRING_FREE(BUFF) free(BUFF)
|
||||
#endif
|
||||
#ifndef STRING_ASSERT
|
||||
@@ -75,45 +95,49 @@ struct String {
|
||||
#define STRING_ASSERT(EXP) assert(EXP)
|
||||
#endif
|
||||
|
||||
/** An empty string **/
|
||||
/** Initializer for an empty String (NULL buffer). Use to declare or reset. **/
|
||||
#define STRING_EMPTY { (char*) NULL, 0, 0 }
|
||||
|
||||
/** String buffer **/
|
||||
/** Read-only buffer pointer. NULL until the String has been written to.
|
||||
Invalidated by any subsequent growing operation. **/
|
||||
#define StringBuff(BLK) ( (const char*) ((BLK).buffer_) )
|
||||
|
||||
/** String buffer (read/write) **/
|
||||
/** Read/write buffer pointer. Same NULL/invalidation rules as StringBuff. **/
|
||||
#define StringBuffRW(BLK) ((BLK).buffer_)
|
||||
|
||||
/** String length **/
|
||||
/** Current length in bytes, excluding the terminating NUL. **/
|
||||
#define StringLength(BLK) ((BLK).length_)
|
||||
|
||||
/** String not empty ? **/
|
||||
/** Non-zero if the String holds at least one byte. **/
|
||||
#define StringNotEmpty(BLK) ( StringLength(BLK) > 0 )
|
||||
|
||||
/** String capacity **/
|
||||
/** Allocated capacity in bytes, including room for the terminating NUL. **/
|
||||
#define StringCapacity(BLK) ((BLK).capacity_)
|
||||
|
||||
/** Subcharacter **/
|
||||
/** Byte at POS (read). No bounds check; POS must be < StringLength. **/
|
||||
#define StringSub(BLK, POS) ( StringBuff(BLK)[POS] )
|
||||
|
||||
/** Subcharacter (read/write) **/
|
||||
/** Byte at POS (read/write). No bounds check; POS must be < StringLength. **/
|
||||
#define StringSubRW(BLK, POS) ( StringBuffRW(BLK)[POS] )
|
||||
|
||||
/** Subcharacter (read/write) **/
|
||||
#define StringSubRW(BLK, POS) ( StringBuffRW(BLK)[POS] )
|
||||
|
||||
/** Right subcharacter **/
|
||||
/** Byte POS positions from the end (read). POS==1 is the last byte. **/
|
||||
#define StringRight(BLK, POS) ( StringBuff(BLK)[StringLength(BLK) - POS] )
|
||||
|
||||
/** Right subcharacter (read/write) **/
|
||||
/** Byte POS positions from the end (read/write). POS==1 is the last byte. **/
|
||||
/* POS is parenthesized so that expression arguments (e.g. n + 1) are
   subtracted as a whole; otherwise the write could land on the wrong byte. */
#define StringRightRW(BLK, POS) ( StringBuffRW(BLK)[StringLength(BLK) - (POS)] )
|
||||
|
||||
/** Remove the utter right character from the string. **/
|
||||
/** Drop the last byte and re-terminate. Undefined if the String is empty
|
||||
(no length check; would underflow). **/
|
||||
#define StringPopRight(BLK) do { \
|
||||
StringBuffRW(BLK)[--StringLength(BLK)] = '\0'; \
|
||||
} while(0)
|
||||
|
||||
/** Ensure the string is large enough for exactly CAPACITY bytes overall (including \0). **/
|
||||
/** Grow so capacity_ >= CAPACITY (total bytes, including the NUL). May realloc
|
||||
(invalidating prior buffer pointers); aborts via STRING_ASSERT on OOM.
|
||||
Never shrinks. **/
|
||||
#define StringRoomTotal(BLK, CAPACITY) do { \
|
||||
const size_t capacity_ = (size_t) (CAPACITY); \
|
||||
while ((BLK).capacity_ < capacity_) { \
|
||||
@@ -127,31 +151,37 @@ struct String {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Ensure the string is large enough for exactly SIZE more characters (not including \0). **/
|
||||
/** Reserve room for SIZE more bytes beyond the current length (plus the NUL).
|
||||
May realloc, invalidating prior buffer pointers. **/
|
||||
#define StringRoom(BLK, SIZE) StringRoomTotal(BLK, StringLength(BLK) + (SIZE) + 1)
|
||||
|
||||
/** Return the RW buffer for a strcat() operation of at most SIZE characters. **/
|
||||
/** Reserve room for SIZE more bytes and return the (post-realloc) RW buffer,
|
||||
for appending in place. Does not update length_; the caller must. **/
|
||||
#define StringBuffN(BLK, SIZE) StringBuffN_(&(BLK), SIZE)
|
||||
HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
||||
StringRoom(*blk, size);
|
||||
return StringBuffRW(*blk);
|
||||
}
|
||||
|
||||
/** Initialize a string. **/
|
||||
/** Zero the fields (NULL buffer, no allocation). Use on an uninitialized
|
||||
String only; does NOT free an existing buffer (use StringFree to reset
|
||||
an owned one), so calling it on a live String leaks. **/
|
||||
#define StringInit(BLK) do { \
|
||||
(BLK).buffer_ = NULL; \
|
||||
(BLK).capacity_ = 0; \
|
||||
(BLK).length_ = 0; \
|
||||
} while(0)
|
||||
|
||||
/** Clear a string (set its length to 0) **/
|
||||
/** Truncate to length 0, keeping the allocation. Forces a non-NULL buffer
|
||||
(allocates if empty) and writes the leading NUL, so StringBuff is "". **/
|
||||
#define StringClear(BLK) do { \
|
||||
(BLK).length_ = 0; \
|
||||
StringRoom(BLK, 0); \
|
||||
(BLK).buffer_[0] = '\0'; \
|
||||
} while(0)
|
||||
|
||||
/** Set the length of a string to 'SIZE'. If SIZE is negative, check the size using strlen(). **/
|
||||
/** Set length_ to SIZE, or to strlen(buffer_) if SIZE is negative. Caller
|
||||
asserts SIZE fits the existing content; does not (re)allocate. **/
|
||||
#define StringSetLength(BLK, SIZE) do { \
|
||||
if (SIZE >= 0) { \
|
||||
(BLK).length_ = SIZE; \
|
||||
@@ -160,7 +190,8 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Free a string (release memory) **/
|
||||
/** Release the owned buffer and reset to the empty state (NULL buffer).
|
||||
Idempotent; safe on an already-empty String. **/
|
||||
#define StringFree(BLK) do { \
|
||||
if ((BLK).buffer_ != NULL) { \
|
||||
STRING_FREE((BLK).buffer_); \
|
||||
@@ -170,8 +201,12 @@ HTS_STATIC char *StringBuffN_(String * blk, int size) {
|
||||
(BLK).length_ = 0; \
|
||||
} while(0)
|
||||
|
||||
/** Assign an allocated pointer to a string.
|
||||
The pointer _MUST_ be compatible with STRING_REALLOC() and STRING_FREE() **/
|
||||
/** Take ownership of a NUL-terminated heap string STR (the String will free
|
||||
it). Frees any current buffer first. STR MUST have been allocated by an
|
||||
allocator compatible with STRING_REALLOC()/STRING_FREE(), and must not be
|
||||
freed or used by the caller afterwards. length_/capacity_ are set to
|
||||
strlen(STR) (capacity_ here excludes the NUL, so the next append reallocs).
|
||||
**/
|
||||
#define StringSetBuffer(BLK, STR) do { \
|
||||
size_t len__ = strlen( STR ); \
|
||||
StringFree(BLK); \
|
||||
@@ -180,7 +215,9 @@ The pointer _MUST_ be compatible with STRING_REALLOC() and STRING_FREE() **/
|
||||
(BLK).length_ = len__; \
|
||||
} while(0)
|
||||
|
||||
/** Append a memory block to a string **/
|
||||
/** Append SIZE raw bytes from STR (NULs allowed as data). Grows as needed and
|
||||
re-terminates with a NUL after the appended bytes. STR must not alias
|
||||
BLK's buffer (a realloc would invalidate it). **/
|
||||
#define StringMemcat(BLK, STR, SIZE) do { \
|
||||
const char* str_mc_ = (STR); \
|
||||
const size_t size_mc_ = (size_t) (SIZE); \
|
||||
@@ -192,13 +229,14 @@ The pointer _MUST_ be compatible with STRING_REALLOC() and STRING_FREE() **/
|
||||
*((BLK).buffer_ + (BLK).length_) = '\0'; \
|
||||
} while(0)
|
||||
|
||||
/** Copy a memory block to a string **/
|
||||
/** Replace content with SIZE raw bytes from STR (NULs allowed as data).
|
||||
Same non-aliasing requirement as StringMemcat. **/
|
||||
#define StringMemcpy(BLK, STR, SIZE) do { \
|
||||
(BLK).length_ = 0; \
|
||||
StringMemcat(BLK, STR, SIZE); \
|
||||
} while(0)
|
||||
|
||||
/** Add a character **/
|
||||
/** Append one byte and re-terminate. Grows as needed. **/
|
||||
#define StringAddchar(BLK, c) do { \
|
||||
String * const s__ = &(BLK); \
|
||||
char c__ = (c); \
|
||||
@@ -207,7 +245,9 @@ The pointer _MUST_ be compatible with STRING_REALLOC() and STRING_FREE() **/
|
||||
StringBuffRW(*s__)[StringLength(*s__) ] = 0; \
|
||||
} while(0)
|
||||
|
||||
/** Acquire a string ; it's the client's responsibility to free() it **/
|
||||
/** Hand the buffer to the caller and reset the String to empty (NULL buffer).
|
||||
The returned pointer is now owned by the caller, who must STRING_FREE() it.
|
||||
Returns NULL if the String was empty. **/
|
||||
HTS_STATIC char *StringAcquire(String * blk) {
|
||||
char *buff = StringBuffRW(*blk);
|
||||
|
||||
@@ -217,7 +257,8 @@ HTS_STATIC char *StringAcquire(String * blk) {
|
||||
return buff;
|
||||
}
|
||||
|
||||
/** Clone a string. **/
|
||||
/** Return an independent deep copy of *src (its own allocation). The caller
|
||||
owns the result and must StringFree it. **/
|
||||
HTS_STATIC String StringDup(const String * src) {
|
||||
String s = STRING_EMPTY;
|
||||
|
||||
@@ -225,7 +266,10 @@ HTS_STATIC String StringDup(const String * src) {
|
||||
return s;
|
||||
}
|
||||
|
||||
/** Attach a string using a pointer. **/
|
||||
/** Take ownership of *str (a NUL-terminated heap string) and NULL it out, so
|
||||
ownership transfers and the caller keeps no dangling alias. Frees any
|
||||
current buffer first. *str MUST be allocator-compatible (see
|
||||
StringSetBuffer). No-op if str or *str is NULL. **/
|
||||
HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
StringFree(*blk);
|
||||
if (str != NULL && *str != NULL) {
|
||||
@@ -235,7 +279,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
}
|
||||
}
|
||||
|
||||
/** Append a string to another one. **/
|
||||
/** Append the C string STR (up to its NUL). No-op if STR is NULL. STR must not
|
||||
alias BLK's buffer. **/
|
||||
#define StringCat(BLK, STR) do { \
|
||||
const char *const str__ = ( STR ); \
|
||||
if (str__ != NULL) { \
|
||||
@@ -244,6 +289,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Append at most SIZE leading bytes of the C string STR. No-op if STR is
|
||||
NULL. STR must not alias BLK's buffer. **/
|
||||
#define StringCatN(BLK, STR, SIZE) do { \
|
||||
const char *str__ = ( STR ); \
|
||||
if (str__ != NULL) { \
|
||||
@@ -255,6 +302,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Replace content with at most SIZE leading bytes of the C string STR.
|
||||
If STR is NULL, clears to "". STR must not alias BLK's buffer. **/
|
||||
#define StringCopyN(BLK, STR, SIZE) do { \
|
||||
const char *str__ = ( STR ); \
|
||||
const size_t usize__ = (SIZE); \
|
||||
@@ -270,9 +319,13 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Replace blk's content with a copy of String blk2. blk and blk2 must be
|
||||
distinct Strings (use StringCopyOverlapped if they may be the same). **/
|
||||
#define StringCopyS(blk, blk2) StringCopyN(blk, (blk2).buffer_, (blk2).length_)
|
||||
|
||||
/** Copy a string to another one. **/
|
||||
/** Replace content with a copy of the C string STR. If STR is NULL, clears to
|
||||
"". STR must not alias BLK's buffer (use StringCopyOverlapped if it might).
|
||||
**/
|
||||
#define StringCopy(BLK, STR) do { \
|
||||
const char *str__ = ( STR ); \
|
||||
if (str__ != NULL) { \
|
||||
@@ -283,7 +336,8 @@ HTS_STATIC void StringAttach(String * blk, char **str) {
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/** Copy a (potentially overlapping) string to another one. **/
|
||||
/** Like StringCopy but safe when STR aliases BLK's own buffer: copies via a
|
||||
temporary, so a self-copy or overlap is well-defined. **/
|
||||
#define StringCopyOverlapped(BLK, STR) do { \
|
||||
String s__ = STRING_EMPTY; \
|
||||
StringCopy(s__, STR); \
|
||||
|
||||
@@ -31,6 +31,12 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/** @file htswrap.h
|
||||
Legacy entry points of the callback-wrapper subsystem. The live callback
|
||||
registration API now lives on the httrackp options block (hts_set_callback);
|
||||
only the no-op init/free stubs remain exported here for ABI compatibility.
|
||||
*/
|
||||
|
||||
#ifndef HTSWRAP_DEFH
|
||||
#define HTSWRAP_DEFH
|
||||
|
||||
@@ -50,7 +56,10 @@ typedef struct httrackp httrackp;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** Legacy no-op retained for ABI compatibility; always returns 1. */
|
||||
HTSEXT_API int htswrap_init(void); // LEGACY
|
||||
|
||||
/** Legacy no-op retained for ABI compatibility; always returns 1. */
|
||||
HTSEXT_API int htswrap_free(void); // LEGACY
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -30,6 +30,25 @@ Please visit our Website: http://www.httrack.com
|
||||
/* Author: Xavier Roche */
|
||||
/* ------------------------------------------------------------ */
|
||||
|
||||
/**
|
||||
* @file httrack-library.h
|
||||
* @brief Public C API for embedding the HTTrack mirroring engine.
|
||||
*
|
||||
* Two ways to drive the engine, both supported and used by real consumers:
|
||||
* - argv path: build an argv vector and call hts_main()/hts_main2(), exactly
|
||||
* as the command-line tool is configured.
|
||||
* - struct/callback path: hts_create_opt(), install callbacks with
|
||||
* CHAIN_FUNCTION(), then hts_main2(), then hts_free_opt().
|
||||
*
|
||||
* Typical lifecycle: hts_init() once per process, then per mirror
|
||||
* hts_create_opt() -> CHAIN_FUNCTION() -> hts_main2() (blocking) ->
|
||||
* hts_get_stats()/hts_errmsg() -> hts_free_opt().
|
||||
*
|
||||
* Threading: hts_main2() blocks the calling thread. hts_request_stop() and
|
||||
* hts_has_stopped() are safe to call for the same opt from another thread while
|
||||
* the mirror runs. hts_free_opt() must not run until hts_has_stopped() is true.
|
||||
*/
|
||||
|
||||
#ifndef HTTRACK_DEFLIB
|
||||
#define HTTRACK_DEFLIB
|
||||
|
||||
@@ -54,15 +73,18 @@ typedef struct strc_int2bytes2 strc_int2bytes2;
|
||||
#endif
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
|
||||
#define HTS_DEF_DEFSTRUCT_hts_log_type
|
||||
/** Log severity levels, most to least severe. A message is emitted only if its
|
||||
level is <= opt->debug. LOG_ERRNO is a flag OR'd into the level to append
|
||||
": <strerror(errno)>" to the message. */
|
||||
typedef enum hts_log_type {
|
||||
LOG_PANIC,
|
||||
LOG_ERROR,
|
||||
LOG_WARNING,
|
||||
LOG_NOTICE,
|
||||
LOG_INFO,
|
||||
LOG_DEBUG,
|
||||
LOG_TRACE,
|
||||
LOG_ERRNO = 1 << 8
|
||||
LOG_PANIC, /**< Fatal condition. */
|
||||
LOG_ERROR, /**< Error. */
|
||||
LOG_WARNING, /**< Warning. */
|
||||
LOG_NOTICE, /**< Notice; the default opt->debug level. */
|
||||
LOG_INFO, /**< Informational. */
|
||||
LOG_DEBUG, /**< Debug detail. */
|
||||
LOG_TRACE, /**< Most verbose tracing. */
|
||||
LOG_ERRNO = 1 << 8 /**< Flag: append strerror(errno) to the message. */
|
||||
} hts_log_type;
|
||||
#endif
|
||||
#ifndef HTS_DEF_FWSTRUCT_hts_stat_struct
|
||||
@@ -70,7 +92,9 @@ typedef enum hts_log_type {
|
||||
typedef struct hts_stat_struct hts_stat_struct;
|
||||
#endif
|
||||
|
||||
/** Assert error callback. **/
|
||||
/** Assertion/error handler. Receives the failed expression text, source file,
|
||||
and line. The strings are valid only for the duration of the call; do not
|
||||
retain them. */
|
||||
#ifndef HTS_DEF_FWSTRUCT_htsErrorCallback
|
||||
#define HTS_DEF_FWSTRUCT_htsErrorCallback
|
||||
typedef void (*htsErrorCallback) (const char *msg, const char *file, int line);
|
||||
@@ -79,10 +103,14 @@ typedef void (*htsErrorCallback) (const char *msg, const char *file, int line);
|
||||
/* Helpers for plugging callbacks
|
||||
requires: htsdefines.h */
|
||||
|
||||
/*
|
||||
Add a function callback 'FUNCTION' to the option structure 'OPT' callback member 'MEMBER',
|
||||
with an optional (may be NULL) argument 'ARGUMENT'
|
||||
*/
|
||||
/**
|
||||
* Install callback FUNCTION into OPT->callbacks_fun->MEMBER, chaining it ahead
|
||||
* of any callback already there (whose function and carg are saved for
|
||||
* CALLBACKARG_PREV_FUN/CALLBACKARG_PREV_CARG). ARGUMENT is an optional (may be
|
||||
* NULL) user pointer, later read inside the callback with
|
||||
* CALLBACKARG_USERDEF(). Allocates a t_hts_callbackarg with hts_malloc (not
|
||||
* checked for OOM); it is freed by hts_free_opt().
|
||||
*/
|
||||
#define CHAIN_FUNCTION(OPT, MEMBER, FUNCTION, ARGUMENT) do { \
|
||||
t_hts_callbackarg *carg = (t_hts_callbackarg*) hts_malloc(sizeof(t_hts_callbackarg)); \
|
||||
carg->userdef = ( ARGUMENT ); \
|
||||
@@ -95,180 +123,529 @@ with an optional (may be NULL) argument 'ARGUMENT'
|
||||
/* The following helpers are useful only if you know that a callback might already exist before the call to CHAIN_FUNCTION()
|
||||
If your functions were added just after hts_create_opt(), no need to make the previous function check */
|
||||
|
||||
/* Get the user-defined pointer initially passed to CHAIN_FUNCTION(), given the callback's carg argument */
|
||||
/** Inside a chained callback, return the ARGUMENT pointer originally passed to
|
||||
CHAIN_FUNCTION(), or NULL when CARG is NULL. */
|
||||
#define CALLBACKARG_USERDEF(CARG) ( ( (CARG) != NULL ) ? (CARG)->userdef : NULL )
|
||||
|
||||
/* Get the previously existing function before the call to CHAIN_FUNCTION(), given the callback's carg argument */
|
||||
/** Return the callback of type NAME that this one chained over, cast to its
|
||||
function-pointer type, or NULL. Call it to forward to the prior handler. */
|
||||
#define CALLBACKARG_PREV_FUN(CARG, NAME) ( (t_hts_htmlcheck_ ##NAME) ( ( (CARG) != NULL ) ? (CARG)->prev.fun : NULL ) )
|
||||
|
||||
/* Get the previously existing function argument before the call to CHAIN_FUNCTION(), given the callback's carg argument */
|
||||
/** Return the carg of the callback this one chained over (pass it when
|
||||
forwarding to the CALLBACKARG_PREV_FUN result), or NULL. */
|
||||
#define CALLBACKARG_PREV_CARG(CARG) ( ( (CARG) != NULL ) ? (CARG)->prev.carg : NULL )
|
||||
|
||||
/* Functions */
|
||||
|
||||
/* Initialization */
|
||||
/** Initialize the engine (lazy, idempotent, process-global): threading, the
|
||||
hashtable assert handler, modules, the MD5 self-test, and TLS when built
|
||||
with it. Only the first call does work. Honors $HTS_LOG for the debug level.
|
||||
Always returns 1. Call before hts_create_opt() or hts_main(). */
|
||||
HTSEXT_API int hts_init(void);
|
||||
|
||||
/** No-op kept for API compatibility. Frees nothing (the process-global mutexes
|
||||
set up by hts_init() are never released) and always returns 1. */
|
||||
HTSEXT_API int hts_uninit(void);
|
||||
|
||||
/** Block until all background mirror threads have finished. No-op unless built
|
||||
with threaded fetching. */
|
||||
HTSEXT_API void htsthread_wait(void);
|
||||
|
||||
/* Main functions */
|
||||
/** Run a full mirror from a command-line argv (argv[0] is ignored, as in
|
||||
main()). Creates a fresh option set, runs the engine, and frees it. Returns
|
||||
the engine exit code. Call hts_init() first. */
|
||||
HTSEXT_API int hts_main(int argc, char **argv);
|
||||
|
||||
/** Run a full mirror using a caller-supplied option set. Use this instead of
|
||||
hts_main() to set options or plug callbacks on opt first. Blocks until the
|
||||
mirror ends and returns the engine exit code. The caller keeps ownership of
|
||||
opt and must release it with hts_free_opt(). */
|
||||
HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt);
|
||||
|
||||
/* Options handling */
|
||||
/** Allocate and default-initialize an option set, preloading the bundled parser
|
||||
modules. Returns a heap object the caller owns and must release with
|
||||
hts_free_opt(). Does not return NULL on allocation failure. */
|
||||
HTSEXT_API httrackp *hts_create_opt(void);
|
||||
|
||||
/** Free an option set created by hts_create_opt() (callback chains, plugged
|
||||
modules, DNS cache, owned strings, and the structure). NULL is accepted. The
|
||||
pointer is invalid afterward. Do not call while a mirror is running on that
|
||||
opt; wait until hts_has_stopped() is true. */
|
||||
HTSEXT_API void hts_free_opt(httrackp * opt);
|
||||
|
||||
/** Return sizeof(httrackp) as the library sees it, for caller-vs-library struct
|
||||
ABI mismatch checks. */
|
||||
HTSEXT_API size_t hts_sizeof_opt(void);
|
||||
|
||||
/** Snapshot opt's error/warning/info counters and return a pointer to them.
|
||||
Returns NULL if opt is NULL. The result aliases a single process-global
|
||||
static: it is not thread-safe and is overwritten by the next call, so copy
|
||||
out the fields you need. */
|
||||
HTSEXT_API const hts_stat_struct* hts_get_stats(httrackp * opt);
|
||||
|
||||
/** Legacy no-op retained for API compatibility. */
|
||||
HTSEXT_API void set_wrappers(httrackp * opt); /* LEGACY */
|
||||
|
||||
/** Load a plugin shared library and run its hts_plug(opt, argv) entry point. On
|
||||
success the handle is recorded in opt and unloaded by hts_free_opt().
|
||||
@return 1 if loaded and hts_plug succeeded; 0 if loaded but hts_plug was
|
||||
missing or refused; -1 if the library could not be loaded. */
|
||||
HTSEXT_API int plug_wrapper(httrackp * opt, const char *moduleName,
|
||||
const char *argv);
|
||||
|
||||
/** Install the process-global assertion/error callback (NULL clears it). Not
|
||||
per-opt, and not safe to change while a mirror runs. */
|
||||
HTSEXT_API void hts_set_error_callback(htsErrorCallback handler);
|
||||
|
||||
/** Return the current process-global error callback, or NULL. */
|
||||
HTSEXT_API htsErrorCallback hts_get_error_callback(void);
|
||||
|
||||
/* Logging */
|
||||
/** Legacy: write prefix then msg to opt->log. Returns 0 if written, 1 if
|
||||
opt->log is NULL. Prefer hts_log_print(). */
|
||||
HTSEXT_API int hts_log(httrackp * opt, const char *prefix, const char *msg);
|
||||
|
||||
/** printf-style log at level @p type (an hts_log_type, optionally |LOG_ERRNO).
|
||||
Forwards to the registered log callback, and when the level is <= opt->debug
|
||||
also to opt->log. @p format must be non-NULL. */
|
||||
HTSEXT_API void hts_log_print(httrackp * opt, int type, const char *format,
|
||||
...) HTS_PRINTF_FUN(3, 4);
|
||||
|
||||
/** va_list form of hts_log_print(). @p opt may be NULL (only the callback
|
||||
runs). Preserves errno. @p format must be non-NULL. */
|
||||
HTSEXT_API void hts_log_vprint(httrackp * opt, int type, const char *format,
|
||||
va_list args);
|
||||
HTSEXT_API void hts_set_log_vprint_callback(void (*callback)(httrackp * opt,
|
||||
int type,
|
||||
const char *format, va_list args));
|
||||
|
||||
/** Install the process-global log callback invoked by hts_log_vprint() for
|
||||
every message, regardless of opt->debug (NULL clears it). Not per-opt. */
|
||||
HTSEXT_API void
|
||||
hts_set_log_vprint_callback(void (*callback)(httrackp *opt, int type,
|
||||
const char *format, va_list args));
|
||||
|
||||
/* Infos */
|
||||
/** Human-readable build/feature string plus the names of plugged modules. The
|
||||
result is written into and aliases a 2048-byte scratch buffer inside opt: it
|
||||
is valid until that buffer is next used, and must not be freed. opt must be
|
||||
non-NULL. */
|
||||
HTSEXT_API const char *hts_get_version_info(httrackp * opt);
|
||||
|
||||
/** Static build-features string (TLS, zlib, ipv6, and so on). Process-global
|
||||
storage; do not free or modify. */
|
||||
HTSEXT_API const char *hts_is_available(void);
|
||||
HTSEXT_API const char* hts_version(void);
|
||||
HTSEXT_API const hts_stat_struct* hts_get_stats(httrackp * opt);
|
||||
|
||||
/** HTTrack version id string. Static storage; do not free. */
|
||||
HTSEXT_API const char *hts_version(void);
|
||||
|
||||
/* Wrapper functions */
|
||||
HTSEXT_API int htswrap_init(void); // DEPRECATED - DUMMY FUNCTION
|
||||
|
||||
HTSEXT_API int htswrap_free(void); // DEPRECATED - DUMMY FUNCTION
|
||||
|
||||
/** Register callback @p fct under @p name in opt's callback table (for example
|
||||
"start", "check-html", "linkdetected"). Returns 1 on success, 0 if @p name
|
||||
is not a known slot. Prefer CHAIN_FUNCTION(), which preserves any prior
|
||||
callback. */
|
||||
HTSEXT_API int htswrap_add(httrackp * opt, const char *name, void *fct);
|
||||
|
||||
/** Return the function pointer registered under @p name in opt as a uintptr_t,
|
||||
or 0 if none or unknown. */
|
||||
HTSEXT_API uintptr_t htswrap_read(httrackp * opt, const char *name);
|
||||
HTSEXT_API int htswrap_set_userdef(httrackp * opt, void *userdef);
|
||||
HTSEXT_API void *htswrap_get_userdef(httrackp * opt);
|
||||
|
||||
/* Internal library allocators, if a different libc is being used by the client */
|
||||
/** strdup() through the library allocator. Returns a heap copy freed with
|
||||
hts_free(), or NULL on failure. */
|
||||
HTSEXT_API char *hts_strdup(const char *string);
|
||||
|
||||
/** malloc() through the library allocator. Free with hts_free(). NULL on OOM.
|
||||
*/
|
||||
HTSEXT_API void *hts_malloc(size_t size);
|
||||
|
||||
/** realloc() through the library allocator. NULL on failure, leaving the
|
||||
original block unchanged. */
|
||||
HTSEXT_API void *hts_realloc(void *const data, const size_t size);
|
||||
|
||||
/** free() through the library allocator. NULL is accepted. */
|
||||
HTSEXT_API void hts_free(void *data);
|
||||
|
||||
/* Other functions */
|
||||
HTSEXT_API int hts_resetvar(void); // DEPRECATED - DUMMY FUNCTION
|
||||
|
||||
/** (Re)build the top-level index.html aggregating every mirror project found
|
||||
under @p path. @p binpath is the data root used to locate the
|
||||
templates/topindex-*.html files, falling back to built-in templates. Writes
|
||||
<path>/index.html. @return 1 on success, 0 on failure. */
|
||||
HTSEXT_API int hts_buildtopindex(httrackp * opt, const char *path,
|
||||
const char *binpath);
|
||||
|
||||
/** Scan every mirror project under @p path and return a CRLF-separated list:
|
||||
@p type==1 gives the distinct category names, any other value gives the
|
||||
project directory names. The result is heap-allocated and owned by the
|
||||
caller (free with freet()); it may be NULL. Not UTF-8. @p path is modified in
|
||||
place (a trailing '/' is stripped). */
|
||||
HTSEXT_API char *hts_getcategories(char *path, int type);
|
||||
|
||||
/** Read the `category=` value from a winprofile.ini file. The result is
|
||||
heap-allocated and owned by the caller (free with freet()), or NULL when the
|
||||
file is missing or has no category line. Not UTF-8. */
|
||||
HTSEXT_API char *hts_getcategory(const char *filename);
|
||||
|
||||
/* Catch-URL */
|
||||
/** Open a local capture socket (a mini-proxy), trying a list of standard ports
|
||||
until one binds. Writes the chosen port to *port_prox and the local host
|
||||
address into adr_prox (a caller buffer of at least 128 bytes), and returns
|
||||
the listening socket. Returns INVALID_SOCKET if no port could be bound. */
|
||||
HTSEXT_API T_SOC catch_url_init_std(int *port_prox, char *adr_prox);
|
||||
|
||||
/** Open a local capture socket bound to *port (0 picks a free port). Writes the
|
||||
effective port back to *port and the local dotted address into @p adr (a
|
||||
caller buffer of at least 128 bytes), and returns the listening socket.
|
||||
Returns INVALID_SOCKET on failure. */
|
||||
HTSEXT_API T_SOC catch_url_init(int *port, char *adr);
|
||||
|
||||
/** Block on capture socket @p soc, accept one browser connection, and capture
|
||||
the proxied HTTP request: write the absolute URL to @p url, the upper-cased
|
||||
method to @p method, and the rebuilt request (request line, headers, and any
|
||||
POST body) to @p data, then send a canned response and close.
|
||||
@return 1 on success, 0 on error; on error @p url instead holds the peer's
|
||||
"ip:port". The buffers are caller-allocated and not bounds-checked: @p data
|
||||
must be CATCH_URL_DATA_SIZE bytes, and @p url / @p method must fit the
|
||||
captured request line. */
|
||||
HTSEXT_API int catch_url(T_SOC soc, char *url, char *method, char *data);
|
||||
|
||||
/* State */
|
||||
/** Whether the engine is parsing HTML. Returns 0 if not, otherwise the percent
|
||||
done (at least 1). @p flag >= 0 also requests a progress refresh; pass a
|
||||
negative value to query without side effects. */
|
||||
HTSEXT_API int hts_is_parsing(httrackp * opt, int flag);
|
||||
|
||||
/** Current background phase: 0 none, 1 testing links, 2 purge, 3, 4 scheduling,
|
||||
5 waiting for a slot. */
|
||||
HTSEXT_API int hts_is_testing(httrackp * opt);
|
||||
|
||||
/** Nonzero once the engine has begun its exit sequence. */
|
||||
HTSEXT_API int hts_is_exiting(httrackp * opt);
|
||||
|
||||
/*HTSEXT_API int hts_setopt(httrackp* opt); DEPRECATED ; see copy_htsopt() */
|
||||
|
||||
/** Queue extra start URLs to inject into a running mirror. @p url is a
|
||||
caller-owned, NULL-terminated array of strings; the engine stores the
|
||||
pointer without copying, so the array and its strings must stay valid until
|
||||
the engine consumes them. @return nonzero if a list is now set. */
|
||||
HTSEXT_API int hts_addurl(httrackp * opt, char **url);
|
||||
|
||||
/** Clear any pending add-URL list set by hts_addurl(). Always returns 0. */
|
||||
HTSEXT_API int hts_resetaddurl(httrackp * opt);
|
||||
|
||||
/** Apply the runtime-tunable options from @p from onto @p to, to adjust a live
|
||||
mirror. Only fields set to a non-sentinel value are copied; the rest of @p
|
||||
to is left untouched. The user-agent string is deep-copied. @return 0. */
|
||||
HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to);
|
||||
|
||||
/** Return the engine's last error message, or NULL. The string is owned by
|
||||
@p opt; do not free it, and use it only while @p opt lives. */
|
||||
HTSEXT_API char *hts_errmsg(httrackp * opt);
|
||||
|
||||
/** Get or set the transfer-pause flag. @p p >= 0 sets it (nonzero means
|
||||
paused); a negative value queries. @return the current pause flag. */
|
||||
HTSEXT_API int hts_setpause(httrackp * opt, int p);
|
||||
|
||||
/** Ask the running mirror to terminate (sets the stop flag under the state
|
||||
lock, so it is safe to call from another thread). @p force is currently
|
||||
ignored.
|
||||
@return 0; no-op if @p opt is NULL. */
|
||||
HTSEXT_API int hts_request_stop(httrackp * opt, int force);
|
||||
|
||||
/** Queue a single in-progress file, by URL, to be cancelled by the engine.
|
||||
@p url is copied internally. Takes the state lock, so it is thread-safe.
|
||||
@return the underlying push result. */
|
||||
HTSEXT_API int hts_cancel_file_push(httrackp * opt, const char *url);
|
||||
|
||||
/** Cancel the in-progress link-testing phase. Effective only while a test runs.
|
||||
*/
|
||||
HTSEXT_API void hts_cancel_test(httrackp * opt);
|
||||
|
||||
/** Cancel the in-progress HTML parsing. Effective only while parsing is active.
|
||||
*/
|
||||
HTSEXT_API void hts_cancel_parsing(httrackp * opt);
|
||||
HTSEXT_API void hts_cancel_test(httrackp * opt);
|
||||
HTSEXT_API void hts_cancel_parsing(httrackp * opt);
|
||||
|
||||
/** Nonzero once the mirror has fully ended. Read under the engine state lock,
|
||||
so safe to poll from another thread. Wait for this before hts_free_opt(). */
|
||||
HTSEXT_API int hts_has_stopped(httrackp * opt);
|
||||
|
||||
/* Tools */
|
||||
/** Ensure the directory chain leading to @p path exists, creating missing
|
||||
directories. @p path ends either with '/' (a directory) or a filename (its
|
||||
basename is ignored). A regular file blocking a needed directory is renamed
|
||||
to "<name>.txt". @p path is NOT UTF-8. @return 0 on success or if it already
|
||||
exists, -1 on error. */
|
||||
HTSEXT_API int structcheck(const char *path);
|
||||
|
||||
/** Like structcheck() but @p path is UTF-8. @return 0 on success, -1 on error.
|
||||
*/
|
||||
HTSEXT_API int structcheck_utf8(const char *path);
|
||||
|
||||
/** Whether the directory containing @p path exists. The basename is stripped
|
||||
first, so passing a file path tests its parent directory. @return 1 if it is
|
||||
a directory, 0 otherwise. */
|
||||
HTSEXT_API int dir_exists(const char *path);
|
||||
|
||||
/** Write the HTTP reason phrase for @p statuscode into @p msg, a caller buffer
|
||||
of at least 64 bytes. For an unknown code a non-empty @p msg is kept,
|
||||
otherwise it is set to "Unknown error". */
|
||||
HTSEXT_API void infostatuscode(char *msg, int statuscode);
|
||||
|
||||
/** Return the static reason-phrase string for @p statuscode, or NULL if
|
||||
unknown. The pointer is a string literal; do not free it. */
|
||||
HTSEXT_API const char *infostatuscode_const(int statuscode);
|
||||
|
||||
/** Current wall-clock time in milliseconds since the Unix epoch. */
|
||||
HTSEXT_API TStamp mtime_local(void);
|
||||
|
||||
/** Format a duration @p t (in seconds) into a compact string in @p st, for
|
||||
example "3d,02h,04min05s". @p st is caller-allocated and not bounds-checked.
|
||||
*/
|
||||
HTSEXT_API void qsec2str(char *st, TStamp t);
|
||||
|
||||
/* The int2* helpers below write into the caller-supplied strc and return
|
||||
pointers into it. No allocation happens; the result is valid only until strc
|
||||
is reused, and a given strc is not reentrant. Use one strc per
|
||||
concurrently-live result. */
|
||||
/** Format @p n as a decimal string into @p strc and return it. */
|
||||
HTSEXT_API char *int2char(strc_int2bytes2 * strc, int n);
|
||||
|
||||
/** Format byte count @p n as "<num><unit>" (B/KiB/MiB/GiB and so on) into
|
||||
@p strc and return it. */
|
||||
HTSEXT_API char *int2bytes(strc_int2bytes2 * strc, LLint n);
|
||||
|
||||
/** Format a transfer rate @p n as "<num><unit>/s" into @p strc and return it.
|
||||
*/
|
||||
HTSEXT_API char *int2bytessec(strc_int2bytes2 * strc, long int n);
|
||||
|
||||
/** Split byte count @p n into number and unit, returning a 2-element array
|
||||
{number, unit} stored inside @p strc. */
|
||||
HTSEXT_API char **int2bytes2(strc_int2bytes2 * strc, LLint n);
|
||||
|
||||
/** Skip any "user[:pass]@" identification prefix in a URL, returning a pointer
|
||||
into the argument past it (or past the protocol if none). The result aliases
|
||||
the input string. */
|
||||
HTSEXT_API char *jump_identification(char *);
|
||||
|
||||
HTSEXT_API const char *jump_identification_const(const char *);
|
||||
|
||||
/** Like jump_identification() and also strip a leading "www." host prefix,
|
||||
returning a pointer into the input to the normalized host. */
|
||||
HTSEXT_API char *jump_normalized(char *);
|
||||
|
||||
HTSEXT_API const char *jump_normalized_const(const char *);
|
||||
|
||||
/** Return a pointer (into the input) to the ":port" part of a URL host, or NULL
|
||||
if there is no explicit port. Handles bracketed IPv6 literals. */
|
||||
HTSEXT_API char *jump_toport(char *);
|
||||
|
||||
HTSEXT_API const char *jump_toport_const(const char *);
|
||||
|
||||
/** Canonicalize a URL path into @p dest: collapse duplicate '/' and sort the
|
||||
query-string arguments, so "?b=2&a=1" and "?a=1&b=2" compare equal. Returns
|
||||
@p dest, a caller buffer of at least strlen(source)+1 bytes (the output is
|
||||
never longer than the input). */
|
||||
HTSEXT_API char *fil_normalized(const char *source, char *dest);
|
||||
|
||||
/** Write the normalized host of @p source (identification and "www." stripped)
|
||||
into @p dest, truncated to @p destsize. Returns @p dest. */
|
||||
HTSEXT_API char *adr_normalized_sized(const char *source, char *dest,
|
||||
size_t destsize);
|
||||
|
||||
/** @deprecated Use adr_normalized_sized(). This form has no destination size
|
||||
and assumes @p dest is the engine URL buffer of HTS_URLMAXSIZE*2 bytes; a
|
||||
smaller buffer can overflow. */
|
||||
HTS_DEPRECATED("use adr_normalized_sized(source, dest, destsize)")
|
||||
|
||||
HTSEXT_API char *adr_normalized(const char *source, char *dest);
|
||||
|
||||
/** Get or set the process executable root directory (with trailing '/'). The
|
||||
first call with non-NULL @p file initializes it and returns NULL; later
|
||||
initialization calls are ignored. Call with NULL to query: returns the
|
||||
stored directory, or "" if never set. The result is a static internal buffer;
|
||||
do not free it, and do not set it from multiple threads. */
|
||||
HTSEXT_API const char *hts_rootdir(char *file);
|
||||
|
||||
/* Escaping URLs */
|
||||
/*
|
||||
* Size contract shared by the escape/unescape family below.
|
||||
* For the escape_* / append_escape_* / inplace_escape_* /
|
||||
* escape_for_html_print* / make_content_id / x_escape_http functions, `size` is
|
||||
* the total capacity of `dest` including the terminating NUL. The size_t return
|
||||
* is the number of bytes written, NOT counting the NUL; on overflow it returns
|
||||
* `size` and `dest` is still NUL-terminated (truncated). Passing sizeof(a
|
||||
* pointer) as the size trips a runtime assert. The unescape_http* functions
|
||||
* instead return `dest` (the catbuff pointer) and truncate to fit `size`.
|
||||
*/
|
||||
/** Decode HTML entities in @p s in place (for example "&amp;" becomes "&"). */
|
||||
HTSEXT_API void unescape_amp(char *s);
|
||||
|
||||
/** Percent-escape only spaces (' ' becomes "%20"); copy everything else
|
||||
* verbatim. */
|
||||
HTSEXT_API size_t escape_spc_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Aggressively percent-escape @p src for use as a single URL path segment
|
||||
(reserved, delimiter, unwise, special, avoid and mark characters). */
|
||||
HTSEXT_API size_t escape_in_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Percent-escape @p src as a URI, escaping only what is necessary and keeping
|
||||
'/' and other reserved characters. */
|
||||
HTSEXT_API size_t escape_uri(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Like escape_uri() for a UTF-8 URI: also escapes reserved characters other
|
||||
than '/'. */
|
||||
HTSEXT_API size_t escape_uri_utf(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Minimal "make safe" escape: percent-escapes only '"', ' ' and control
|
||||
characters, leaving an already-formed URL otherwise intact. */
|
||||
HTSEXT_API size_t escape_check_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Append-variant of escape_spc_url(): escapes @p src after the existing
|
||||
NUL-terminated content of @p dest. Returns the bytes appended (excluding the
|
||||
NUL). */
|
||||
HTSEXT_API size_t append_escape_spc_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Append-variant of escape_in_url(). See append_escape_spc_url(). */
|
||||
HTSEXT_API size_t append_escape_in_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Append-variant of escape_uri(). See append_escape_spc_url(). */
|
||||
HTSEXT_API size_t append_escape_uri(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Append-variant of escape_uri_utf(). See append_escape_spc_url(). */
|
||||
HTSEXT_API size_t append_escape_uri_utf(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Append-variant of escape_check_url(). See append_escape_spc_url(). */
|
||||
HTSEXT_API size_t append_escape_check_url(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** In-place variant of escape_spc_url(): escapes the NUL-terminated string in
|
||||
@p dest back into @p dest. */
|
||||
HTSEXT_API size_t inplace_escape_spc_url(char *const dest, const size_t size);
|
||||
|
||||
/** In-place variant of escape_in_url(). See inplace_escape_spc_url(). */
|
||||
HTSEXT_API size_t inplace_escape_in_url(char *const dest, const size_t size);
|
||||
|
||||
/** In-place variant of escape_uri(). See inplace_escape_spc_url(). */
|
||||
HTSEXT_API size_t inplace_escape_uri(char *const dest, const size_t size);
|
||||
|
||||
/** In-place variant of escape_uri_utf(). See inplace_escape_spc_url(). */
|
||||
HTSEXT_API size_t inplace_escape_uri_utf(char *const dest, const size_t size);
|
||||
|
||||
/** In-place variant of escape_check_url(). See inplace_escape_spc_url(). */
|
||||
HTSEXT_API size_t inplace_escape_check_url(char *const dest, const size_t size);
|
||||
|
||||
/** Same escaping as escape_check_url() but returns @p dest instead of the byte
|
||||
count. */
|
||||
HTSEXT_API char *escape_check_url_addr(const char *const src, char *const dest, const size_t size);
|
||||
|
||||
/** Build a MIME/MHTML content-id token in @p dest from @p adr and @p fil:
|
||||
escape_in_url() both, then replace every '%' with 'X' so the result is one
|
||||
opaque token. */
|
||||
HTSEXT_API size_t make_content_id(const char *const adr, const char *const fil, char *const dest, const size_t size);
|
||||
|
||||
/** Low-level percent-escaper backing the escape_* family. @p mode selects the
|
||||
character class to escape: 0 check_url, 1 in_url, 2 spc_url, 3 uri,
|
||||
30 uri_utf. @p max_size is the dest capacity including the NUL. */
|
||||
HTSEXT_API size_t x_escape_http(const char *const s, char *const dest, const size_t max_size, const int mode);
|
||||
|
||||
/** Strip all control characters (byte value < 32) from @p s in place. */
|
||||
HTSEXT_API void escape_remove_control(char *const s);
|
||||
|
||||
/** HTML-escape for text output: rewrite '&' to "&amp;" and pass every other
|
||||
byte through unchanged. */
|
||||
HTSEXT_API size_t escape_for_html_print(const char *const s, char *const dest, const size_t size);
|
||||
|
||||
/** Like escape_for_html_print() but also convert every high byte (>= 128) to a
|
||||
numeric entity "&#xNN;". */
|
||||
HTSEXT_API size_t escape_for_html_print_full(const char *const s, char *const dest, const size_t size);
|
||||
|
||||
/** Percent-decode @p s into @p catbuff (capacity @p size) and return @p
|
||||
catbuff. Decodes every "%xx" hex escape. */
|
||||
HTSEXT_API char *unescape_http(char *const catbuff, const size_t size, const char *const s);
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size, const char *s, const int no_high);
|
||||
HTSEXT_API char *antislash_unescaped(char *catbuff, const char *s);
|
||||
|
||||
HTSEXT_API void escape_remove_control(char *s);
|
||||
/** Percent-decode @p s into @p catbuff, but only the escapes that are safe to
|
||||
decode while keeping a valid URI (reserved, delimiter, unwise, control and
|
||||
must-avoid escapes are kept encoded, and %25 is never decoded). @p no_high &
|
||||
1 also decodes high (>= 128) bytes; @p no_high & 2 also decodes an escaped
|
||||
space. Returns @p catbuff. */
|
||||
HTSEXT_API char *unescape_http_unharm(char *const catbuff, const size_t size, const char *s, const int no_high);
|
||||
|
||||
/** Determine the MIME type of local file name @p fil into @p s (capacity
|
||||
@p ssize): user --assume rules, then ".html", then the built-in extension
|
||||
table. @p flag != 0 forces a fallback type. @return 1 if a type was written,
|
||||
0 otherwise. */
|
||||
HTSEXT_API int get_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil, int flag);
|
||||
|
||||
/** @deprecated Use get_httptype_sized(). Assumes @p s has at least
|
||||
HTS_MIMETYPE_SIZE capacity. */
|
||||
HTS_DEPRECATED("use get_httptype_sized(opt, s, ssize, fil, flag)")
|
||||
|
||||
HTSEXT_API void get_httptype(httrackp * opt, char *s, const char *fil,
|
||||
int flag);
|
||||
|
||||
/** Classify @p fil by its extension: 0 unknown, 1 known non-HTML, 2 known HTML.
|
||||
Consults the built-in table then user --assume rules. 0 for a NULL @p fil.
|
||||
*/
|
||||
HTSEXT_API int is_knowntype(httrackp * opt, const char *fil);
|
||||
|
||||
/** Like is_knowntype() but consults only the user --assume rules: 0 no rule,
|
||||
1 non-HTML, 2 HTML. */
|
||||
HTSEXT_API int is_userknowntype(httrackp * opt, const char *fil);
|
||||
|
||||
/** 1 if @p fil, an extension such as "asp" or "php" (not a full filename), is a
|
||||
known dynamic-page type, else 0. */
|
||||
HTSEXT_API int is_dyntype(const char *fil);
|
||||
|
||||
/** Extract the extension of @p fil (text after the last '.', stopping at '?')
|
||||
into caller scratch @p catbuff (capacity @p size) and return it. Returns ""
|
||||
(a literal, not @p catbuff) when there is no extension or it does not fit.
|
||||
*/
|
||||
HTSEXT_API const char *get_ext(char *catbuff, size_t size, const char *fil);
|
||||
|
||||
/** 1 if MIME type @p st must not be reclassified or renamed (hypertext types
|
||||
and a built-in keep-list of commonly mislabeled types), else 0. */
|
||||
HTSEXT_API int may_unknown(httrackp * opt, const char *st);
|
||||
|
||||
/** Guess the MIME type of local file @p fil into @p s (capacity @p ssize),
|
||||
always producing a type. @return 1 if a type was written. */
|
||||
HTSEXT_API int guess_httptype_sized(httrackp *opt, char *s, size_t ssize,
|
||||
const char *fil);
|
||||
|
||||
/** @deprecated Use guess_httptype_sized(). Assumes @p s has at least
|
||||
HTS_MIMETYPE_SIZE capacity. */
|
||||
HTS_DEPRECATED("use guess_httptype_sized(opt, s, ssize, fil)")
|
||||
|
||||
HTSEXT_API void guess_httptype(httrackp * opt, char *s, const char *fil);
|
||||
|
||||
/* Ugly string tools */
|
||||
/* These take a caller scratch buffer catbuff of capacity size and return it. On
|
||||
overflow they stop without writing past size and return the truncated buffer.
|
||||
size must be a real array sizeof (the macros below check this at compile
|
||||
time), not a pointer. */
|
||||
/** Concatenate @p a and @p b into @p catbuff (NULL or empty operands are
|
||||
* skipped). */
|
||||
HTSEXT_API char *concat(char *catbuff, size_t size, const char *a, const char *b);
|
||||
|
||||
/** Like concat(a, b) but convert '/' to the platform path separator (Windows).
|
||||
*/
|
||||
HTSEXT_API char *fconcat(char *catbuff, size_t size, const char *a, const char *b);
|
||||
|
||||
/** Copy @p a into @p catbuff, converting '/' to the platform path separator
|
||||
(Windows). */
|
||||
HTSEXT_API char *fconv(char *catbuff, size_t size, const char *a);
|
||||
|
||||
/** Copy @p a into @p catbuff, converting every '\\' to '/' on all platforms. */
|
||||
HTSEXT_API char *fslash(char *catbuff, size_t size, const char *a);
|
||||
|
||||
/* Debugging */
|
||||
/** Set the process-global debug verbosity (0 is off); higher levels log more to
|
||||
stderr. Bit 0x80 redirects debug output to "hts-debug.txt". */
|
||||
HTSEXT_API void hts_debug(int level);
|
||||
|
||||
/* Portable directory API */
|
||||
@@ -276,6 +653,7 @@ HTSEXT_API void hts_debug(int level);
|
||||
#ifndef HTS_DEF_FWSTRUCT_find_handle_struct
|
||||
#define HTS_DEF_FWSTRUCT_find_handle_struct
|
||||
typedef struct find_handle_struct find_handle_struct;
|
||||
|
||||
typedef find_handle_struct *find_handle;
|
||||
#endif
|
||||
|
||||
@@ -283,22 +661,53 @@ typedef find_handle_struct *find_handle;
|
||||
#define HTS_DEF_FWSTRUCT_topindex_chain
|
||||
typedef struct topindex_chain topindex_chain;
|
||||
#endif
|
||||
/** One node of the index/category listing built when generating the top index.
|
||||
*/
|
||||
struct topindex_chain {
|
||||
int level; /* sort level */
|
||||
char *category; /* category */
|
||||
char name[2048]; /* path */
|
||||
struct topindex_chain *next; /* next element */
|
||||
int level; /**< sort level */
|
||||
char *category; /**< category (heap string) */
|
||||
char name[2048]; /**< path */
|
||||
struct topindex_chain *next; /**< next element */
|
||||
};
|
||||
|
||||
/** Open directory @p path for iteration, positioned on the first entry. Returns
|
||||
an opaque handle to free with hts_findclose(), or NULL on empty path or open
|
||||
failure. */
|
||||
HTSEXT_API find_handle hts_findfirst(char *path);
|
||||
|
||||
/** Advance to the next directory entry. Returns 1 if an entry is available, 0
|
||||
at end of directory. */
|
||||
HTSEXT_API int hts_findnext(find_handle find);
|
||||
|
||||
/** Close the iteration and free @p find. Always returns 0; NULL is accepted. */
|
||||
HTSEXT_API int hts_findclose(find_handle find);
|
||||
|
||||
/** Name of the current entry, or NULL. Points into the handle's storage; valid
|
||||
only until the next hts_findnext()/hts_findclose(). */
|
||||
HTSEXT_API char *hts_findgetname(find_handle find);
|
||||
|
||||
/** Size in bytes of the current entry, or -1. Truncated to int, so unreliable
|
||||
for files larger than 2 GB. */
|
||||
HTSEXT_API int hts_findgetsize(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a directory, else 0 (a system/special entry, see
|
||||
hts_findissystem(), reports 0). */
|
||||
HTSEXT_API int hts_findisdir(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a regular file, else 0 (a system/special entry,
|
||||
see hts_findissystem(), reports 0). */
|
||||
HTSEXT_API int hts_findisfile(find_handle find);
|
||||
|
||||
/** 1 if the current entry is a special/system entry to skip: "." or "..", on
|
||||
POSIX also device/fifo/socket nodes, on Windows also system, hidden or
|
||||
temporary entries. Else 0. */
|
||||
HTSEXT_API int hts_findissystem(find_handle find);
|
||||
|
||||
/* UTF-8 aware FILE API */
|
||||
/* On non-Windows these macros resolve directly to the POSIX calls. On Windows
|
||||
they map to the hts_*_utf8 wrappers below, which convert the UTF-8 path to
|
||||
UTF-16 and call the wide CRT, falling back to the narrow CRT if conversion
|
||||
fails. Always pass UTF-8 paths through these. */
|
||||
#ifndef HTS_DEF_FILEAPI
|
||||
#ifdef _WIN32
|
||||
#define FOPEN hts_fopen_utf8
|
||||
@@ -306,6 +715,7 @@ HTSEXT_API FILE *hts_fopen_utf8(const char *path, const char *mode);
|
||||
|
||||
#define STAT hts_stat_utf8
|
||||
typedef struct _stat STRUCT_STAT;
|
||||
|
||||
HTSEXT_API int hts_stat_utf8(const char *path, STRUCT_STAT * buf);
|
||||
|
||||
#define UNLINK hts_unlink_utf8
|
||||
@@ -315,10 +725,13 @@ HTSEXT_API int hts_unlink_utf8(const char *pathname);
|
||||
HTSEXT_API int hts_rename_utf8(const char *oldpath, const char *newpath);
|
||||
|
||||
#define MKDIR(F) hts_mkdir_utf8(F)
|
||||
|
||||
HTSEXT_API int hts_mkdir_utf8(const char *pathname);
|
||||
|
||||
#define UTIME(A,B) hts_utime_utf8(A,B)
|
||||
|
||||
typedef struct _utimbuf STRUCT_UTIMBUF;
|
||||
|
||||
HTSEXT_API int hts_utime_utf8(const char *filename,
|
||||
const STRUCT_UTIMBUF * times);
|
||||
#else
|
||||
@@ -329,6 +742,7 @@ typedef struct stat STRUCT_STAT;
|
||||
#define UNLINK unlink
|
||||
#define RENAME rename
|
||||
#define MKDIR(F) mkdir(F, HTS_ACCESS_FOLDER)
|
||||
|
||||
typedef struct utimbuf STRUCT_UTIMBUF;
|
||||
|
||||
#define UTIME(A,B) utime(A,B)
|
||||
@@ -336,19 +750,22 @@ typedef struct utimbuf STRUCT_UTIMBUF;
|
||||
#define HTS_DEF_FILEAPI
|
||||
#endif
|
||||
|
||||
/** Macro aimed to break at build-time if a size is not a sizeof() strictly
|
||||
/** Macro aimed to break at build-time if a size is not a sizeof() strictly
|
||||
* greater than sizeof(char*). **/
|
||||
#undef COMPILE_TIME_CHECK_SIZE
|
||||
#define COMPILE_TIME_CHECK_SIZE(A) (void) ((void (*)(char[A - sizeof(char*) - 1])) NULL)
|
||||
|
||||
/** Macro aimed to break at compile-time if a size is not a sizeof() strictly
|
||||
/** Macro aimed to break at run-time if a size is equal to sizeof(void*), which
|
||||
* most likely means sizeof() was taken on a pointer and not on an array. **/
|
||||
#undef RUNTIME_TIME_CHECK_SIZE
|
||||
#define RUNTIME_TIME_CHECK_SIZE(A) assertf((A) != sizeof(void*))
|
||||
|
||||
#define fconv(A,B,C) (COMPILE_TIME_CHECK_SIZE(B), fconv(A,B,C))
|
||||
|
||||
#define concat(A,B,C,D) (COMPILE_TIME_CHECK_SIZE(B), concat(A,B,C,D))
|
||||
|
||||
#define fconcat(A,B,C,D) (COMPILE_TIME_CHECK_SIZE(B), fconcat(A,B,C,D))
|
||||
|
||||
#define fslash(A,B,C) (COMPILE_TIME_CHECK_SIZE(B), fslash(A,B,C))
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
48
tests/01_engine-cache-golden.test
Normal file
48
tests/01_engine-cache-golden.test
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
|
||||
# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
|
||||
# tool flags despite the #!/bin/bash above.
|
||||
|
||||
# Golden cache-format regression test (driven by 'httrack -#B <dir>').
|
||||
#
|
||||
# 01_engine-cache.test writes the cache with the same build it reads back (a
|
||||
# round-trip), so it cannot catch a read-path or ZIP-format regression where
|
||||
# writer and reader drift together. This reads a *committed* cache frozen by an
|
||||
# earlier build and asserts a fixed set of entries still decodes field- and
|
||||
# byte-exact.
|
||||
#
|
||||
# Regenerate the fixture after a deliberate format change with
|
||||
# 'httrack -#B <dir> regen', then copy <dir>/hts-cache/new.zip over the
|
||||
# committed file.
|
||||
|
||||
set -eu
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
fixture="$top_srcdir/tests/fixtures/cache-golden"
|
||||
|
||||
test -e "$fixture/hts-cache/new.zip" || {
|
||||
echo "missing committed cache fixture: $fixture/hts-cache/new.zip" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
dir=$(mktemp -d)
|
||||
trap 'rm -rf "$dir"' EXIT
|
||||
|
||||
# Read against a private copy so the source tree is never touched (a read
|
||||
# session does not write, but copying keeps the test hermetic). Create the dir
|
||||
# with mkdir so it is writable for the cleanup trap: under "make distcheck" the
|
||||
# srcdir is read-only, and "cp -r" of that directory would carry its read-only
|
||||
# mode over and defeat the rm -rf.
|
||||
mkdir -p "$dir/hts-cache"
|
||||
cp "$fixture/hts-cache/new.zip" "$dir/hts-cache/new.zip"
|
||||
|
||||
out=$(httrack -#B "$dir")
|
||||
|
||||
# Match the exact success line: the read must have found and verified every
|
||||
# entry, not merely failed to enter the mode (a bad -#B falls through to the
|
||||
# usage screen, which also exits non-zero but never prints this).
|
||||
test "$out" = "cache-golden: OK" || {
|
||||
echo "expected 'cache-golden: OK', got: $out" >&2
|
||||
exit 1
|
||||
}
|
||||
@@ -1,4 +1,8 @@
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh
|
||||
# Committed binary fixture read by 01_engine-cache-golden.test. List it
|
||||
# explicitly: automake does not expand wildcards in EXTRA_DIST, so a glob would
|
||||
# silently drop it from the dist tarball and break "make distcheck".
|
||||
EXTRA_DIST = $(TESTS) crawl-test.sh run-all-tests.sh check-network.sh \
|
||||
fixtures/cache-golden/hts-cache/new.zip
|
||||
|
||||
TESTS_ENVIRONMENT =
|
||||
TESTS_ENVIRONMENT += PATH=$(top_builddir)/src$(PATH_SEPARATOR)$$PATH
|
||||
@@ -17,6 +21,7 @@ TEST_LOG_COMPILER = $(BASH)
|
||||
TESTS = \
|
||||
00_runnable.test \
|
||||
01_engine-cache.test \
|
||||
01_engine-cache-golden.test \
|
||||
01_engine-charset.test \
|
||||
01_engine-cmdline.test \
|
||||
01_engine-doitlog.test \
|
||||
|
||||
BIN
tests/fixtures/cache-golden/hts-cache/new.zip
vendored
Normal file
BIN
tests/fixtures/cache-golden/hts-cache/new.zip
vendored
Normal file
Binary file not shown.
Reference in New Issue
Block a user