mirror of
https://github.com/xroche/httrack.git
synced 2026-06-29 21:45:24 +03:00
Compare commits
4 Commits
accept-enc
...
worktree-a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f0cb0b18d7 | ||
|
|
13b31986d5 | ||
|
|
bd7e0989f6 | ||
|
|
bd74ec7cab |
150
src/htscore.c
150
src/htscore.c
@@ -406,29 +406,40 @@ void hts_invalidate_link(httrackp * opt, int lpos) {
|
||||
opt->liens[lpos]->pass2 = -1;
|
||||
}
|
||||
|
||||
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
// Write the makeindex footer (refresh meta when makeindex_links==1), close
|
||||
// the file, then run usercommand.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil) {
|
||||
if (!*makeindex_done) {
|
||||
if (*makeindex_fp) {
|
||||
char BIGSTK tempo[1024];
|
||||
if (makeindex_links == 1) {
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE * 2];
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped));
|
||||
snprintf(tempo, sizeof(tempo),
|
||||
"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">" CRLF,
|
||||
link_escaped);
|
||||
} else
|
||||
tempo[0] = '\0';
|
||||
hts_template_format(*makeindex_fp, template_footer,
|
||||
"<!-- Mirror and index made by HTTrack Website "
|
||||
"Copier/" HTTRACK_VERSION " " HTTRACK_AFF_AUTHORS
|
||||
" -->",
|
||||
tempo, /* EOF */ NULL);
|
||||
fflush(*makeindex_fp);
|
||||
fclose(*makeindex_fp);
|
||||
*makeindex_fp = NULL;
|
||||
usercommand(opt, 0, NULL,
|
||||
fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
|
||||
StringBuff(opt->path_html_utf8), "index.html"),
|
||||
adr, fil);
|
||||
}
|
||||
}
|
||||
*makeindex_done = 1;
|
||||
}
|
||||
|
||||
/* does it look like XML ? (SVG et al.) */
|
||||
static int look_like_xml(const char *s) {
|
||||
@@ -1796,90 +1807,18 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
if (strnotempty(savename()) == 0) { // pas de chemin de sauvegarde
|
||||
if (strcmp(urlfil(), "/robots.txt") == 0) { // robots.txt
|
||||
if (r.adr) {
|
||||
int bptr = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK buff[8192];
|
||||
char BIGSTK infobuff[8192];
|
||||
int record = 0;
|
||||
|
||||
line[0] = '\0';
|
||||
buff[0] = '\0';
|
||||
infobuff[0] = '\0';
|
||||
//
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", r.adr);
|
||||
#endif
|
||||
do {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(r.adr + bptr, line, sizeof(line) - 2);
|
||||
/* strip comment */
|
||||
comm = strchr(line, '#');
|
||||
if (comm != NULL) {
|
||||
*comm = '\0';
|
||||
}
|
||||
/* strip spaces */
|
||||
llen = (int) strlen(line);
|
||||
while(llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a;
|
||||
|
||||
a = line + 11;
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // c pour nous
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack")
|
||||
|| strfield(a, "webhttrack")) {
|
||||
buff[0] = '\0'; // re-enregistrer
|
||||
infobuff[0] = '\0';
|
||||
record = 2; // locked
|
||||
#if DEBUG_ROBOTS
|
||||
printf("explicit disallow for httrack\n");
|
||||
#endif
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
if (strfield(line, "disallow:")) {
|
||||
char *a = line + 9;
|
||||
|
||||
while(is_realspace(*a))
|
||||
a++; // sauter espace(s)
|
||||
if (strnotempty(a)) {
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
if (strcmp(a, "/") != 0 ||
|
||||
opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
hts_boolean keep_root = (opt->robots >= HTS_ROBOTS_ALWAYS_STRICT)
|
||||
? HTS_TRUE
|
||||
: HTS_FALSE;
|
||||
#else
|
||||
hts_boolean keep_root = HTS_TRUE;
|
||||
#endif
|
||||
{ /* ignoring disallow: / */
|
||||
if ((strlen(buff) + strlen(a) + 8) < sizeof(buff)) {
|
||||
strcatbuff(buff, a);
|
||||
strcatbuff(buff, "\n");
|
||||
if ((strlen(infobuff) + strlen(a) + 8) <
|
||||
sizeof(infobuff)) {
|
||||
if (strnotempty(infobuff))
|
||||
strcatbuff(infobuff, ", ");
|
||||
strcatbuff(infobuff, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef IGNORE_RESTRICTIVE_ROBOTS
|
||||
else {
|
||||
hts_log_print(opt, LOG_NOTICE,
|
||||
"Note: %s robots.txt rules are too restrictive, ignoring /",
|
||||
urladr());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
} while((bptr < r.size) && (strlen(buff) < (sizeof(buff) - 32)));
|
||||
if (strnotempty(buff)) {
|
||||
checkrobots_set(&robots, urladr(), buff);
|
||||
|
||||
robots_parse(&robots, urladr(), r.adr, r.size, infobuff,
|
||||
sizeof(infobuff), keep_root);
|
||||
if (strnotempty(infobuff)) {
|
||||
hts_log_print(opt, LOG_INFO,
|
||||
"Note: robots.txt forbidden links for %s are: %s",
|
||||
urladr(), infobuff);
|
||||
@@ -2116,7 +2055,8 @@ int httpmirror(char *url1, httrackp * opt) {
|
||||
/*
|
||||
Ensure the index is being closed
|
||||
*/
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp, makeindex_links,
|
||||
makeindex_firstlink, template_footer, "", "");
|
||||
|
||||
/*
|
||||
updating-a-remotely-deleted-website hack
|
||||
|
||||
@@ -362,6 +362,14 @@ void usercommand(httrackp * opt, int exe, const char *cmd, const char *file,
|
||||
|
||||
void usercommand_exe(const char *cmd, const char *file);
|
||||
|
||||
// Finish the makeindex index.html (footer + refresh meta), run usercommand.
|
||||
// Updates *makeindex_done/*makeindex_fp in place; adr/fil are the mode strings.
|
||||
void hts_finish_makeindex(httrackp *opt, int *makeindex_done,
|
||||
FILE **makeindex_fp, int makeindex_links,
|
||||
const char *makeindex_firstlink,
|
||||
const char *template_footer, const char *adr,
|
||||
const char *fil);
|
||||
|
||||
int filters_init(char ***ptrfilters, int maxfilter, int filterinc);
|
||||
|
||||
int fspc(httrackp * opt, FILE * fp, const char *type);
|
||||
|
||||
@@ -167,30 +167,6 @@ Please visit our Website: http://www.httrack.com
|
||||
}
|
||||
#define HT_ADD_FOP
|
||||
|
||||
// COPY IN HTSCORE.C
|
||||
#define HT_INDEX_END do { \
|
||||
if (!makeindex_done) { \
|
||||
if (makeindex_fp) { \
|
||||
char BIGSTK tempo[1024]; \
|
||||
if (makeindex_links == 1) { \
|
||||
char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
|
||||
escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
|
||||
snprintf(tempo,sizeof(tempo),"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \
|
||||
} else \
|
||||
tempo[0]='\0'; \
|
||||
hts_template_format(makeindex_fp,template_footer, \
|
||||
"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->", \
|
||||
tempo, /* EOF */ NULL \
|
||||
); \
|
||||
fflush(makeindex_fp); \
|
||||
fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \
|
||||
makeindex_fp=NULL; \
|
||||
usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
|
||||
} \
|
||||
} \
|
||||
makeindex_done=1; /* ok c'est fait */ \
|
||||
} while(0)
|
||||
|
||||
#define ENGINE_DEFINE_CONTEXT() \
|
||||
ENGINE_DEFINE_CONTEXT_BASE(); \
|
||||
/* */ \
|
||||
@@ -709,7 +685,9 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
}
|
||||
|
||||
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
|
||||
HT_INDEX_END;
|
||||
hts_finish_makeindex(opt, &makeindex_done, &makeindex_fp,
|
||||
makeindex_links, makeindex_firstlink,
|
||||
template_footer, "primary", "primary");
|
||||
}
|
||||
} // if (opt->makeindex)
|
||||
}
|
||||
|
||||
167
src/htsrobots.c
167
src/htsrobots.c
@@ -44,28 +44,84 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
// -- robots --
|
||||
|
||||
/* RFC 9309 path-prefix match; '*' any run, '$' anchors end; linear. */
|
||||
static hts_boolean robots_pattern_match(const char *pattern, const char *path) {
|
||||
size_t patlen = strlen(pattern);
|
||||
hts_boolean anchored = HTS_FALSE;
|
||||
const char *p, *pend, *s;
|
||||
const char *star = NULL, *star_s = NULL;
|
||||
|
||||
if (patlen > 0 && pattern[patlen - 1] == '$') {
|
||||
anchored = HTS_TRUE;
|
||||
patlen--;
|
||||
}
|
||||
p = pattern;
|
||||
pend = pattern + patlen;
|
||||
s = path;
|
||||
while (*s != '\0') {
|
||||
if (p == pend) {
|
||||
if (!anchored)
|
||||
return HTS_TRUE; // prefix matched
|
||||
if (star != NULL) { // anchored: '*' must eat the rest
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
continue;
|
||||
}
|
||||
return HTS_FALSE;
|
||||
}
|
||||
if (*p == '*') {
|
||||
star = p++;
|
||||
star_s = s;
|
||||
} else if (*p == *s) {
|
||||
p++;
|
||||
s++;
|
||||
} else if (star != NULL) {
|
||||
p = star + 1;
|
||||
s = ++star_s;
|
||||
} else {
|
||||
return HTS_FALSE;
|
||||
}
|
||||
}
|
||||
while (p < pend && *p == '*')
|
||||
p++;
|
||||
return (p == pend) ? HTS_TRUE : HTS_FALSE;
|
||||
}
|
||||
|
||||
// fil="" : vérifier si règle déja enregistrée
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
while(robots) {
|
||||
if (strfield2(robots->adr, adr)) {
|
||||
if (fil[0]) {
|
||||
/* RFC 9309: longest pattern wins, Allow beats Disallow on ties. */
|
||||
int ptr = 0;
|
||||
char line[250];
|
||||
char line[HTS_ROBOTS_TOKEN_SIZE];
|
||||
size_t toklen = strlen(robots->token);
|
||||
size_t best_len = 0;
|
||||
hts_boolean matched = HTS_FALSE;
|
||||
hts_boolean best_allow = HTS_FALSE;
|
||||
|
||||
if (strnotempty(robots->token)) {
|
||||
do {
|
||||
ptr += binput(robots->token + ptr, line, 200);
|
||||
if (line[0] == '/') { // absolu
|
||||
if (strfield(fil, line)) { // commence avec ligne
|
||||
return -1; // interdit
|
||||
}
|
||||
} else { // relatif
|
||||
if (strstrcase(fil, line)) {
|
||||
return -1;
|
||||
while (ptr < (int) toklen) {
|
||||
ptr += binput(robots->token + ptr, line, sizeof(line) - 1);
|
||||
if (line[0] != 'A' && line[0] != 'D')
|
||||
continue;
|
||||
{
|
||||
const hts_boolean is_allow =
|
||||
(line[0] == 'A') ? HTS_TRUE : HTS_FALSE;
|
||||
const char *pat = line + 1;
|
||||
|
||||
if (robots_pattern_match(pat, fil)) {
|
||||
const size_t len = strlen(pat);
|
||||
|
||||
if (!matched || len > best_len || (len == best_len && is_allow)) {
|
||||
matched = HTS_TRUE;
|
||||
best_len = len;
|
||||
best_allow = is_allow;
|
||||
}
|
||||
}
|
||||
} while((strnotempty(line)) && (ptr < (int) strlen(robots->token)));
|
||||
}
|
||||
}
|
||||
if (matched && !best_allow)
|
||||
return -1; // forbidden
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@@ -74,6 +130,93 @@ int checkrobots(robots_wizard * robots, const char *adr, const char *fil) {
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Append one rule line, "<marker><pat>\n", to the NUL-terminated string held
   in `blob` (capacity `blobsize` bytes). A rule that does not fit entirely,
   terminating NUL included, is dropped rather than truncated. */
static void robots_blob_add(char *blob, size_t blobsize, char marker,
                            const char *pat) {
  const size_t end = strlen(blob);
  // bytes left after the current text; at least 1 while blob is terminated
  // inside its capacity, so the subtraction cannot wrap
  const size_t room = blobsize - end;

  // marker + pattern + '\n' must still leave one byte for the NUL
  if (strlen(pat) + 2 >= room)
    return;

  blob[end] = marker;
  blob[end + 1] = '\0';
  strlcatbuff(blob, pat, blobsize);
  strlcatbuff(blob, "\n", blobsize);
}
|
||||
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow) {
|
||||
size_t bptr = 0;
|
||||
int record = 0;
|
||||
char BIGSTK line[1024];
|
||||
char BIGSTK blob[HTS_ROBOTS_TOKEN_SIZE];
|
||||
|
||||
blob[0] = '\0';
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
#if DEBUG_ROBOTS
|
||||
printf("robots.txt dump:\n%s\n", body);
|
||||
#endif
|
||||
while (bptr < bodysize) {
|
||||
char *comm;
|
||||
int llen;
|
||||
|
||||
bptr += binput(body + bptr, line, sizeof(line) - 2);
|
||||
comm = strchr(line, '#'); // strip comment
|
||||
if (comm != NULL)
|
||||
*comm = '\0';
|
||||
llen = (int) strlen(line); // strip trailing spaces
|
||||
while (llen > 0 && is_realspace(line[llen - 1])) {
|
||||
line[llen - 1] = '\0';
|
||||
llen--;
|
||||
}
|
||||
if (strfield(line, "user-agent:")) {
|
||||
char *a = line + 11;
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (*a == '*') {
|
||||
if (record != 2)
|
||||
record = 1; // generic group applies to us
|
||||
} else if (strfield(a, "httrack") || strfield(a, "winhttrack") ||
|
||||
strfield(a, "webhttrack")) {
|
||||
blob[0] = '\0'; // explicit group: restart capture
|
||||
if (info != NULL && infosize > 0)
|
||||
info[0] = '\0';
|
||||
record = 2; // locked to the httrack group
|
||||
} else
|
||||
record = 0;
|
||||
} else if (record) {
|
||||
hts_boolean is_allow = strfield(line, "allow:");
|
||||
hts_boolean is_disallow = !is_allow && strfield(line, "disallow:");
|
||||
|
||||
if (is_allow || is_disallow) {
|
||||
char *a = line + (is_allow ? 6 : 9);
|
||||
|
||||
while (is_realspace(*a))
|
||||
a++;
|
||||
if (strnotempty(a)) {
|
||||
if (is_disallow && !keep_root_disallow && strcmp(a, "/") == 0) {
|
||||
// dropped: site-wide disallow ignored by option
|
||||
} else {
|
||||
robots_blob_add(blob, sizeof(blob), is_allow ? 'A' : 'D', a);
|
||||
if (is_disallow && info != NULL &&
|
||||
strlen(a) + 2 < infosize - strlen(info)) {
|
||||
if (strnotempty(info))
|
||||
strlcatbuff(info, ", ", infosize);
|
||||
strlcatbuff(info, a, infosize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (strnotempty(blob))
|
||||
checkrobots_set(robots, adr, blob);
|
||||
}
|
||||
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data) {
|
||||
if (((int) strlen(adr)) >= sizeof(robots->adr) - 2)
|
||||
return 0;
|
||||
|
||||
@@ -39,17 +39,27 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_DEF_FWSTRUCT_robots_wizard
|
||||
typedef struct robots_wizard robots_wizard;
|
||||
#endif
|
||||
|
||||
/* Per-host blob: one rule per line, first byte 'A'/'D' then path pattern. */
|
||||
#define HTS_ROBOTS_TOKEN_SIZE 4096
|
||||
|
||||
struct robots_wizard {
|
||||
char adr[128];
|
||||
char token[4096];
|
||||
char token[HTS_ROBOTS_TOKEN_SIZE];
|
||||
struct robots_wizard *next;
|
||||
};
|
||||
|
||||
/* Library internal definitions */
|
||||
#ifdef HTS_INTERNAL_BYTECODE
|
||||
/* -1 if `fil` disallowed for `adr` (RFC 9309); empty: -1 if rules exist. */
|
||||
int checkrobots(robots_wizard * robots, const char *adr, const char *fil);
|
||||
void checkrobots_free(robots_wizard * robots);
|
||||
int checkrobots_set(robots_wizard * robots, const char *adr, const char *data);
|
||||
/* Parse robots.txt `body` for `adr`, storing the HTTrack group's rules; `info`
|
||||
gets a disallow summary, `keep_root_disallow` FALSE drops "Disallow: /". */
|
||||
void robots_parse(robots_wizard *robots, const char *adr, const char *body,
|
||||
size_t bodysize, char *info, size_t infosize,
|
||||
hts_boolean keep_root_disallow);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1305,6 +1305,55 @@ static int st_urlhack(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// hts_finish_makeindex writes the footer, emits the refresh meta only when
|
||||
// makeindex_links==1, and clears *fp / sets *done. argv[0] is a writable dir.
|
||||
static int st_makeindex(httrackp *opt, int argc, char **argv) {
|
||||
char path[HTS_URLMAXSIZE];
|
||||
char buf[4096];
|
||||
FILE *fp;
|
||||
size_t n;
|
||||
int done;
|
||||
|
||||
assertf(argc >= 1);
|
||||
snprintf(path, sizeof(path), "%s/index.html", argv[0]);
|
||||
|
||||
/* single first link: footer + a refresh meta carrying the escaped URL */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 1, "http://example.com/a b", "%s%s", "",
|
||||
"");
|
||||
assertf(fp == NULL); /* the function closed and cleared it */
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") != NULL);
|
||||
assertf(strstr(buf, "example.com") != NULL);
|
||||
|
||||
/* no single link: footer only, no refresh meta */
|
||||
done = 0;
|
||||
fp = fopen(path, "wb");
|
||||
assertf(fp != NULL);
|
||||
hts_finish_makeindex(opt, &done, &fp, 0, NULL, "%s%s", "", "");
|
||||
assertf(fp == NULL);
|
||||
assertf(done != 0);
|
||||
fp = fopen(path, "rb");
|
||||
assertf(fp != NULL);
|
||||
n = fread(buf, 1, sizeof(buf) - 1, fp);
|
||||
fclose(fp);
|
||||
buf[n] = '\0';
|
||||
assertf(strstr(buf, "Mirror and index made by HTTrack") != NULL);
|
||||
assertf(strstr(buf, "Refresh") == NULL);
|
||||
|
||||
UNLINK(path);
|
||||
printf("makeindex self-test OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default User-Agent: honest HTTrack token, no resurrected Windows 98. */
|
||||
static int st_useragent(httrackp *opt, int argc, char **argv) {
|
||||
const char *ua = StringBuff(opt->user_agent);
|
||||
@@ -1491,6 +1540,89 @@ static int st_acceptencoding(httrackp *opt, int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Each call parses `txt` under a fresh host, then checkrobots() for `path`. */
|
||||
static int rb_decide(robots_wizard *r, const char *txt, const char *path) {
|
||||
static int n = 0;
|
||||
char host[64];
|
||||
|
||||
snprintf(host, sizeof(host), "h%d.example", n++);
|
||||
robots_parse(r, host, txt, strlen(txt), NULL, 0, HTS_TRUE);
|
||||
return checkrobots(r, host, path);
|
||||
}
|
||||
|
||||
/* robots.txt precedence self-test: each case parses a robots.txt body under a
   fresh host and checks the verdict for one path (0 allowed, -1 denied). */
static int st_robots(httrackp *opt, int argc, char **argv) {
  /* Longer Allow re-opens subtree under Disallow: / (old matcher couldn't). */
  const char *const reopen = "User-agent: *\nDisallow: /\nAllow: /public/\n";
  /* Equal-length match: Allow wins the tie over Disallow. */
  const char *const tie = "User-agent: *\nDisallow: /foo\nAllow: /foo\n";
  /* '*' matches any run of characters. */
  const char *const wildcard = "User-agent: *\nDisallow: /*.php\n";
  /* Trailing '$' anchors the end of the path. */
  const char *const anchored = "User-agent: *\nDisallow: /a$\n";
  /* The httrack-specific group replaces the generic '*' group entirely. */
  const char *const specific = "User-agent: *\nDisallow: /everyone\n"
                               "User-agent: httrack\nDisallow: /\n";
  /* Replace, not merge: the generic group does not bind the httrack group. */
  const char *const replaced = "User-agent: *\nDisallow: /x\n"
                               "User-agent: httrack\nDisallow: /y\n";
  const struct {
    const char *txt;
    const char *path;
    int verdict;
  } cases[] = {
    {reopen, "/public/x", 0},
    {reopen, "/private", -1},
    {reopen, "/", -1},
    {tie, "/foo/bar", 0},
    /* Longest match wins even when it is not the last rule. */
    {"User-agent: *\nDisallow: /a/b\nAllow: /a\n", "/a/b/c", -1},
    {"User-agent: *\nAllow: /a/b\nDisallow: /a\n", "/a/b/c", 0},
    {wildcard, "/a/b/index.php", -1},
    {wildcard, "/a/b/index.html", 0},
    {anchored, "/a", -1},
    {anchored, "/ab", 0},
    {anchored, "/a/b", 0},
    {specific, "/anything", -1},
    {replaced, "/x", 0},
    {replaced, "/y", -1},
    /* No rules: everything is allowed. */
    {"User-agent: *\nDisallow:\n", "/x", 0},
  };
  robots_wizard robots;
  size_t i;

  (void) opt;
  (void) argc;
  (void) argv;
  memset(&robots, 0, sizeof(robots));

  for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
    assertf(rb_decide(&robots, cases[i].txt, cases[i].path) ==
            cases[i].verdict);
  }

  checkrobots_free(&robots);
  printf("robots self-test OK\n");
  return 0;
}
|
||||
|
||||
/* ------------------------------------------------------------ */
|
||||
/* Registry: name -> handler, with a usage hint and a one-line description. */
|
||||
/* ------------------------------------------------------------ */
|
||||
@@ -1538,9 +1670,13 @@ static const struct selftest_entry {
|
||||
{"dns", "", "DNS resolver/cache self-test", st_dns},
|
||||
{"cookies", "", "cookie request-header self-test", st_cookies},
|
||||
{"useragent", "", "default User-Agent self-test", st_useragent},
|
||||
{"makeindex", "[dir]", "hts_finish_makeindex footer/refresh self-test",
|
||||
st_makeindex},
|
||||
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
|
||||
{"acceptencoding", "[dir]",
|
||||
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
|
||||
{"robots", "", "robots.txt RFC 9309 Allow/Disallow precedence self-test",
|
||||
st_robots},
|
||||
};
|
||||
|
||||
static void list_selftests(void) {
|
||||
|
||||
12
tests/01_engine-makeindex.test
Executable file
12
tests/01_engine-makeindex.test
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# hts_finish_makeindex writes the footer and gates the refresh meta on a single
# first link (guards the macro->function extraction).
dir=$(mktemp -d)
trap 'rm -rf "$dir"' EXIT

# Capture the output before searching it: under pipefail, `grep -q` exits at
# the first match and a still-writing httrack could then die from SIGPIPE,
# failing the test although the self-test passed.
out=$(httrack -O /dev/null -#test=makeindex "$dir" run)
grep -q "makeindex self-test OK" <<<"$out"
|
||||
7
tests/01_engine-robots.test
Executable file
7
tests/01_engine-robots.test
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
#

set -euo pipefail

# robots.txt RFC 9309 Allow/Disallow precedence (#452): longest match wins.
# Output is captured first: under pipefail, `grep -q` exiting at the first
# match could SIGPIPE a still-writing httrack and fail the pipeline spuriously.
out=$(httrack -O /dev/null -#test=robots run)
grep -q "robots self-test OK" <<<"$out"
|
||||
@@ -20,6 +20,14 @@ if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "python3 missing, skipping"
|
||||
exit 77
|
||||
fi
|
||||
# The fixture needs a second loopback IP (dead 127.0.0.2 + live 127.0.0.1) for
|
||||
# the fallback to have a target; GNU/Hurd has only 127.0.0.1, so skip there.
|
||||
case "$(uname -s)" in
|
||||
GNU | GNU/*)
|
||||
echo "GNU/Hurd: single loopback IP, connect-fallback fixture unbuildable, skipping"
|
||||
exit 77
|
||||
;;
|
||||
esac
|
||||
|
||||
server="$top_srcdir/tests/local-server.py"
|
||||
root="$top_srcdir/tests/server-root"
|
||||
|
||||
@@ -36,11 +36,13 @@ TESTS = \
|
||||
01_engine-filter.test \
|
||||
01_engine-hashtable.test \
|
||||
01_engine-idna.test \
|
||||
01_engine-makeindex.test \
|
||||
01_engine-mime.test \
|
||||
01_engine-parse.test \
|
||||
01_engine-pause.test \
|
||||
01_engine-rcfile.test \
|
||||
01_engine-relative.test \
|
||||
01_engine-robots.test \
|
||||
01_engine-savename.test \
|
||||
01_engine-selftest-dispatch.test \
|
||||
01_engine-simplify.test \
|
||||
|
||||
Reference in New Issue
Block a user