mirror of
https://github.com/xroche/httrack.git
synced 2026-06-22 10:07:11 +03:00
Compare commits
1 Commits
debian-cop
...
fix/empty-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02180549f6 |
@@ -1,6 +1,6 @@
|
||||
AC_PREREQ([2.71])
|
||||
|
||||
AC_INIT([httrack], [3.49.9], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_INIT([httrack], [3.49.8], [roche+packaging@httrack.com], [httrack], [http://www.httrack.com/])
|
||||
AC_COPYRIGHT([
|
||||
HTTrack Website Copier, Offline Browser for Windows and Unix
|
||||
Copyright (C) 1998-2015 Xavier Roche and other contributors
|
||||
@@ -29,10 +29,9 @@ AC_CONFIG_SRCDIR(src/httrack.c)
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_INIT_AUTOMAKE([subdir-objects])
|
||||
# 3:1:0: 3.49.9 changed code but not the exported interface vs 3.49.8 (same 164
|
||||
# symbols, no struct-layout change), so bump revision only. (3:0:0 was the htsblk
|
||||
# mime-buffer widening, an ABI break that moved the soname .so.2 -> .so.3.)
|
||||
VERSION_INFO="3:1:0"
|
||||
# 4:0:0: htsblk gained the contenttype_given field, an incompatible ABI break,
|
||||
# so bump current and reset revision/age.
|
||||
VERSION_INFO="4:0:0"
|
||||
AM_MAINTAINER_MODE
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
|
||||
|
||||
15
debian/changelog
vendored
15
debian/changelog
vendored
@@ -1,13 +1,12 @@
|
||||
httrack (3.49.9-1) unstable; urgency=medium
|
||||
httrack (3.49.8-3) unstable; urgency=medium
|
||||
|
||||
* New upstream release: Content-Type and file-type detection fixes (trust a
|
||||
declared Content-Type over a binary URL extension, honor --assume under the
|
||||
delayed type check, keep a known extension against a bogus or empty
|
||||
Content-Type, and avoid an uninitialised read on an empty Content-Type), and
|
||||
restored C++ source-compatibility of the installed headers so reverse
|
||||
dependencies (httraqt) build again.
|
||||
* Rename libhttrack3 to libhttrack4 to follow the SONAME bump to
|
||||
libhttrack.so.4: htsblk gained a contenttype_given field, an
|
||||
incompatible ABI change (VERSION_INFO 3 -> 4). The .files wildcard
|
||||
now tracks .so.4* so the runtime libraries land in the right
|
||||
package. New binary package, via NEW.
|
||||
|
||||
-- Xavier Roche <xavier@debian.org> Sun, 21 Jun 2026 17:59:38 +0200
|
||||
-- Xavier Roche <xavier@debian.org> Sat, 20 Jun 2026 19:46:16 +0200
|
||||
|
||||
httrack (3.49.8-2) unstable; urgency=medium
|
||||
|
||||
|
||||
6
debian/control
vendored
6
debian/control
vendored
@@ -58,13 +58,13 @@ Description: webhttrack common files
|
||||
This package is the common files of webhttrack, website copier and
|
||||
mirroring utility
|
||||
|
||||
Package: libhttrack3
|
||||
Package: libhttrack4
|
||||
Architecture: any
|
||||
Multi-Arch: same
|
||||
Section: libs
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Replaces: libhttrack2, httrack (<< 3.49.8-2~)
|
||||
Breaks: libhttrack2, httrack (<< 3.49.8-2~)
|
||||
Replaces: libhttrack3, httrack (<< 3.49.8-3~)
|
||||
Breaks: libhttrack3, httrack (<< 3.49.8-3~)
|
||||
Description: Httrack website copier library
|
||||
This package is the library part of httrack, website copier and mirroring
|
||||
utility
|
||||
|
||||
118
debian/copyright
vendored
118
debian/copyright
vendored
@@ -1,109 +1,21 @@
|
||||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: httrack
|
||||
Upstream-Contact: Xavier Roche <roche@httrack.com>
|
||||
Source: https://www.httrack.com/
|
||||
This package was debianized by Xavier Roche <roche@httrack.com> on
|
||||
Fri, 27 Sep 2002 16:42:26 +0200
|
||||
|
||||
Files: *
|
||||
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||
License: GPL-3+
|
||||
Comment:
|
||||
The engine includes contributions from Yann Philippot (src/htsjava.c,
|
||||
src/htsjava.h). htsbasenet.h links against the system OpenSSL library
|
||||
(originally by Eric Young); no OpenSSL/SSLeay code is bundled here.
|
||||
The current Debian maintainer is Xavier Roche <xavier@debian.org>
|
||||
|
||||
Files: src/minizip/*
|
||||
Copyright: 1998-2010 Gilles Vollant
|
||||
2007-2008 Even Rouault
|
||||
2009-2010 Mathias Svensson
|
||||
1990-2000 Info-ZIP
|
||||
License: Zlib
|
||||
Comment:
|
||||
The decryption code in src/minizip/crypt.h and src/minizip/unzip.c derives
|
||||
from the Info-ZIP distribution, distributed under the same terms.
|
||||
Upstream author: Xavier Roche <roche@httrack.com>
|
||||
|
||||
Files: src/md5.c
|
||||
Copyright: 1993 Colin Plumb
|
||||
License: public-domain-md5
|
||||
This code implements the MD5 message-digest algorithm, due to Ron Rivest.
|
||||
It was written by Colin Plumb in 1993, no copyright is claimed. This code
|
||||
is in the public domain; do with it what you wish.
|
||||
Copyright: 1998-2014 Xavier Roche and other contributors
|
||||
|
||||
Files: src/coucal/*
|
||||
Copyright: 2013-2014 Xavier Roche
|
||||
License: BSD-3-clause
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Files: src/coucal/murmurhash3.h*
|
||||
Copyright: Austin Appleby
|
||||
License: public-domain-murmurhash3
|
||||
MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||
domain. The author hereby disclaims copyright to this source code.
|
||||
On Debian systems, the complete text of the GNU General Public
|
||||
License version 3 can be found in /usr/share/common-licenses/GPL-3 file.
|
||||
|
||||
Files: html/server/div/com.httrack.WebHTTrack.metainfo.xml
|
||||
Copyright: 1998-2026 Xavier Roche and other contributors
|
||||
License: FSFAP
|
||||
Copying and distribution of this file, with or without modification, are
|
||||
permitted in any medium without royalty provided the copyright notice and
|
||||
this notice are preserved. This file is offered as-is, without any warranty.
|
||||
|
||||
Files: debian/*
|
||||
Copyright: 2002-2026 Xavier Roche <xavier@debian.org>
|
||||
License: GPL-3+
|
||||
|
||||
License: GPL-3+
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
.
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
.
|
||||
On Debian systems, the complete text of the GNU General Public License
|
||||
version 3 can be found in /usr/share/common-licenses/GPL-3.
|
||||
|
||||
License: Zlib
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the
|
||||
use of this software.
|
||||
.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
.
|
||||
1. The origin of this software must not be misrepresented; you must not claim
|
||||
that you wrote the original software. If you use this software in a product,
|
||||
an acknowledgment in the product documentation would be appreciated but is
|
||||
not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
License: BSD-3-clause
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
.
|
||||
1. Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
3. Neither the name of the copyright holder nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
3
debian/libhttrack3.files
vendored
3
debian/libhttrack3.files
vendored
@@ -1,3 +0,0 @@
|
||||
usr/lib/*/libhttrack.so.3*
|
||||
usr/lib/*/libhtsjava.so.3*
|
||||
usr/share/httrack/templates
|
||||
3
debian/libhttrack4.files
vendored
Normal file
3
debian/libhttrack4.files
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
usr/lib/*/libhttrack.so.4*
|
||||
usr/lib/*/libhtsjava.so.4*
|
||||
usr/share/httrack/templates
|
||||
@@ -1,3 +1,3 @@
|
||||
# The shared libraries ship without a versioned symbols control file (ABI is
|
||||
# tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
|
||||
libhttrack3: no-symbols-control-file usr/lib/*
|
||||
libhttrack4: no-symbols-control-file usr/lib/*
|
||||
2
debian/rules
vendored
2
debian/rules
vendored
@@ -135,7 +135,7 @@ binary-arch: build install
|
||||
dh_makeshlibs -a -X/usr/lib/$(DEB_HOST_MULTIARCH)/httrack/libtest --version-info
|
||||
dh_installdeb -a
|
||||
# we depend on the current version (ABI may change)
|
||||
dh_shlibdeps -a -ldebian/libhttrack3/usr/lib/$(DEB_HOST_MULTIARCH)
|
||||
dh_shlibdeps -a -ldebian/libhttrack4/usr/lib/$(DEB_HOST_MULTIARCH)
|
||||
dh_gencontrol -a
|
||||
dh_md5sums -a
|
||||
dh_builddeb -a
|
||||
|
||||
@@ -4,12 +4,6 @@ HTTrack Website Copier release history:
|
||||
|
||||
This file lists all changes and fixes that have been made for HTTrack
|
||||
|
||||
3.49-9
|
||||
+ Fixed: file-type detection from the Content-Type header: trust a declared type over a binary URL extension, honor --assume under the delayed type check, and keep a known extension against a bogus or empty Content-Type (#267, #29, #56)
|
||||
+ Fixed: an uninitialized-buffer read when the Content-Type is empty (#411)
|
||||
+ Fixed: restored C++ source-compatibility of the installed headers so reverse dependencies (httraqt) build again (#413)
|
||||
+ Changed: multiple internal build, packaging and test-harness improvements
|
||||
|
||||
3.49-8
|
||||
+ New: tunnel HTTPS downloads through the configured HTTP proxy via CONNECT (#85)
|
||||
+ New: parse every candidate URL in <img> and <source> srcset lists (#326)
|
||||
|
||||
@@ -3703,9 +3703,9 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->maxsoc > 0)
|
||||
to->maxsoc = from->maxsoc;
|
||||
|
||||
/* hts_tristate fields use HTS_DEFAULT (-1) for "unspecified": copy_htsopt
|
||||
skips them so the target keeps its value. */
|
||||
if (from->nearlink > -1)
|
||||
/* hts_boolean/enum fields are unsigned (GCC), so a bare `> -1` unset-guard
|
||||
is always false; cast to int to keep the -1 "unset" sentinel test. */
|
||||
if ((int) from->nearlink > -1)
|
||||
to->nearlink = from->nearlink;
|
||||
|
||||
if (from->timeout > -1)
|
||||
@@ -3732,10 +3732,10 @@ HTSEXT_API int copy_htsopt(const httrackp * from, httrackp * to) {
|
||||
if (from->hostcontrol > -1)
|
||||
to->hostcontrol = from->hostcontrol;
|
||||
|
||||
if (from->errpage > -1)
|
||||
if ((int) from->errpage > -1)
|
||||
to->errpage = from->errpage;
|
||||
|
||||
if (from->parseall > -1)
|
||||
if ((int) from->parseall > -1)
|
||||
to->parseall = from->parseall;
|
||||
|
||||
// test all: bit 8 de travel
|
||||
|
||||
@@ -2579,7 +2579,7 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
(r.size >= 0) ? r.size : (-r.size));
|
||||
if (r.contenttype >= 0) {
|
||||
fprintf(stdout, "Content-Type: %s\r\n",
|
||||
hts_effective_mime(r.contenttype));
|
||||
r.contenttype);
|
||||
}
|
||||
if (r.cdispo[0]) {
|
||||
fprintf(stdout, "Content-Disposition: %s\r\n",
|
||||
@@ -3166,16 +3166,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
|
||||
if (to->parseall != HTS_FALSE)
|
||||
err = 1;
|
||||
|
||||
/* HTS_DEFAULT (-1) is "unspecified": copy_htsopt must skip it,
|
||||
leaving the target intact. Only a signed (int-backed) field
|
||||
can hold -1, so this also guards the type against regressing
|
||||
to an unsigned hts_boolean. */
|
||||
from->parseall = HTS_DEFAULT;
|
||||
to->parseall = HTS_TRUE;
|
||||
copy_htsopt(from, to);
|
||||
if (to->parseall != HTS_TRUE)
|
||||
err = 1;
|
||||
|
||||
hts_free_opt(from);
|
||||
hts_free_opt(to);
|
||||
printf("copy-htsopt: %s\n", err ? "FAIL" : "OK");
|
||||
|
||||
@@ -43,8 +43,8 @@ Please visit our Website: http://www.httrack.com
|
||||
configure.ac, decoupled from these). VERSION is the display form, VERSIONID
|
||||
the dotted numeric form, AFF_VERSION the short form shown in footers,
|
||||
LIB_VERSION the data/cache format generation. */
|
||||
#define HTTRACK_VERSION "3.49-9"
|
||||
#define HTTRACK_VERSIONID "3.49.9"
|
||||
#define HTTRACK_VERSION "3.49-8"
|
||||
#define HTTRACK_VERSIONID "3.49.8"
|
||||
#define HTTRACK_AFF_VERSION "3.x"
|
||||
#define HTTRACK_LIB_VERSION "2.0"
|
||||
|
||||
@@ -247,23 +247,13 @@ Please visit our Website: http://www.httrack.com
|
||||
#define HTS_NOPARAM "(none)"
|
||||
#define HTS_NOPARAM2 "\"(none)\""
|
||||
|
||||
/* Boolean flag for option fields and API yes/no returns. Int-backed, not an
|
||||
enum: an enum makes C++ reject `field = 1` / `f(0)` on the exported fields
|
||||
and params. Int-sized, so the httrackp layout and the ABI are unchanged. */
|
||||
/* Boolean flag for option fields and API yes/no returns. An enum (not C bool)
|
||||
so it stays int-sized: option fields keep the httrackp layout/ABI, and a
|
||||
return type stays compatible with the int it replaces. */
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
#define HTS_DEF_DEFSTRUCT_hts_boolean
|
||||
|
||||
typedef int hts_boolean;
|
||||
#define HTS_FALSE 0
|
||||
#define HTS_TRUE 1
|
||||
#endif
|
||||
|
||||
#ifndef HTS_DEF_DEFSTRUCT_hts_tristate
|
||||
#define HTS_DEF_DEFSTRUCT_hts_tristate
|
||||
/* Tri-state hts_boolean: HTS_DEFAULT (-1) = "unspecified" (copy_htsopt leaves
|
||||
the target untouched); HTS_FALSE/HTS_TRUE = off/on. */
|
||||
typedef int hts_tristate;
|
||||
#define HTS_DEFAULT (-1)
|
||||
typedef enum hts_boolean { HTS_FALSE = 0, HTS_TRUE = 1 } hts_boolean;
|
||||
#endif
|
||||
|
||||
/* Larger/smaller of two values. Macros: arguments are evaluated twice. */
|
||||
|
||||
17
src/htslib.c
17
src/htslib.c
@@ -1396,6 +1396,8 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
|
||||
void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
const char *a = rcvd;
|
||||
|
||||
retour->contenttype_given = HTS_FALSE; /* set when a Content-Type is seen */
|
||||
|
||||
// exemple:
|
||||
// HTTP/1.0 200 OK
|
||||
if (*a) {
|
||||
@@ -1423,7 +1425,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
else
|
||||
infostatuscode(retour->msg, retour->statuscode);
|
||||
// type MIME par défaut2
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
} else { // pas de code!
|
||||
retour->statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(retour->msg, "Unknown response structure");
|
||||
@@ -1438,7 +1440,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
retour->statuscode = HTTP_OK;
|
||||
retour->keep_alive = 0;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
} else if (strnotempty(a)) {
|
||||
retour->statuscode = STATUSCODE_INVALID;
|
||||
strcpybuff(retour->msg, "Unknown (not HTTP/xx) response structure");
|
||||
@@ -1447,7 +1449,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
retour->statuscode = HTTP_OK;
|
||||
retour->keep_alive = 0;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
}
|
||||
}
|
||||
} else { // vide!
|
||||
@@ -1458,7 +1460,7 @@ void treatfirstline(htsblk * retour, const char *rcvd) {
|
||||
/* This is dirty .. */
|
||||
retour->statuscode = HTTP_OK;
|
||||
strcpybuff(retour->msg, "Unknown, assuming junky server");
|
||||
strcpybuff(retour->contenttype, HTS_UNKNOWN_MIME);
|
||||
strcpybuff(retour->contenttype, HTS_HYPERTEXT_DEFAULT_MIME);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1589,14 +1591,16 @@ void treathead(t_cookie * cookie, const char *adr, const char *fil, htsblk * ret
|
||||
}
|
||||
}
|
||||
}
|
||||
// An empty/whitespace Content-Type value yields no token: keep the
|
||||
// sentinel default rather than reading an uninitialized tempo.
|
||||
// An empty/whitespace Content-Type value yields no token; keep the
|
||||
// default type and the "not given" flag instead of reading uninit tempo.
|
||||
if (sscanf(rcvd + p, "%s", tempo) == 1) {
|
||||
if (strlen(tempo) < sizeof(retour->contenttype) - 2) // pas trop long!!
|
||||
strcpybuff(retour->contenttype, tempo);
|
||||
else
|
||||
strcpybuff(retour->contenttype,
|
||||
"application/octet-stream-unknown"); // erreur
|
||||
retour->contenttype_given =
|
||||
HTS_TRUE; /* server declared a usable type */
|
||||
}
|
||||
}
|
||||
} else if ((p = strfield(rcvd, "Content-Range:")) != 0) {
|
||||
@@ -4314,7 +4318,6 @@ int give_mimext(char *s, size_t ssize, const char *st) {
|
||||
int ok = 0;
|
||||
int j = 0;
|
||||
|
||||
st = hts_effective_mime(st); /* no declared type: derive an html ext */
|
||||
s[0] = '\0';
|
||||
while((!ok) && (strnotempty(hts_mime[j][1]))) {
|
||||
if (strfield2(hts_mime[j][0], st)) {
|
||||
|
||||
18
src/htslib.h
18
src/htslib.h
@@ -481,22 +481,10 @@ HTS_STATIC int strcmpnocase(const char *a, const char *b) {
|
||||
|
||||
// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type?
|
||||
#define HTS_HYPERTEXT_DEFAULT_MIME "text/html"
|
||||
/* Sentinel stored when the server declared no Content-Type. It is html-ish
|
||||
for every type test (so a typeless response still parses/stores as today),
|
||||
but the naming code (wire_patches_ext) treats it as "no declared type" and
|
||||
keeps the URL extension. It rides the cache, so updates name consistently. */
|
||||
#define HTS_UNKNOWN_MIME "unknown/unknown"
|
||||
/* Map the no-declared-type sentinel back to a real type for any header or
|
||||
record we EMIT or PERSIST, so "unknown/unknown" never reaches a consumer
|
||||
(a served Content-Type, a ProxyTrack .arc record, ...). */
|
||||
#define hts_effective_mime(m) \
|
||||
(strfield2((m), HTS_UNKNOWN_MIME) ? HTS_HYPERTEXT_DEFAULT_MIME : (m))
|
||||
|
||||
#define is_html_mime_type(a) \
|
||||
((strfield2((a), "text/html") != 0) || \
|
||||
(strfield2((a), "application/xhtml+xml") != 0) || \
|
||||
(strfield2((a), HTS_UNKNOWN_MIME) != \
|
||||
0) /* no declared type: treat as html */ \
|
||||
#define is_html_mime_type(a) \
|
||||
( (strfield2((a),"text/html")!=0)\
|
||||
|| (strfield2((a),"application/xhtml+xml")!=0) \
|
||||
)
|
||||
#define is_hypertext_mime__(a) \
|
||||
( \
|
||||
|
||||
@@ -142,11 +142,10 @@ static void cleanEndingSpaceOrDot(char *s) {
|
||||
saved file? True when the type is patchable (may_unknown2) and either the URL
|
||||
extension implies no specific type or the server declared a disagreeing one.
|
||||
A URL extension mapping to a specific non-HTML type is kept only when the
|
||||
server declared NO type (the HTS_UNKNOWN_MIME sentinel; the #267 mangle
|
||||
guard): a typeless .png stays .png, but a .pdf explicitly served as text/html
|
||||
is named .html. The sentinel rides the cache, so updates stay consistent. */
|
||||
server sent NO Content-Type (the #267 mangle guard): a typeless .png stays
|
||||
.png, but a .pdf explicitly served as text/html is named .html. */
|
||||
static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
const char *file) {
|
||||
const char *file, int contenttype_given) {
|
||||
char urlmime[256];
|
||||
|
||||
if (may_unknown2(opt, wiremime, file))
|
||||
@@ -158,11 +157,11 @@ static int wire_patches_ext(httrackp *opt, const char *wiremime,
|
||||
if (strfield2(wiremime, urlmime))
|
||||
return 0; /* wire agrees with the ext: keep it (no .htm->.html churn) */
|
||||
/* wire disagrees with a specific non-HTML URL ext. Keep the ext only when
|
||||
the server declared no type (the sentinel); an explicitly declared type,
|
||||
even text/html, is trusted, so a binary-looking URL that really serves
|
||||
HTML (login/error interstitial, soft-404) is named .html. */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) &&
|
||||
strfield2(wiremime, HTS_UNKNOWN_MIME))
|
||||
the server sent NO Content-Type: a missing type is defaulted to text/html
|
||||
upstream and must not clobber e.g. a .png. An explicitly declared type is
|
||||
trusted, so a binary-looking URL that really serves HTML (login/error
|
||||
interstitial, soft-404) is named .html instead of kept as .pdf/.jpg. */
|
||||
if (!is_hypertext_mime(opt, urlmime, file) && !contenttype_given)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
@@ -412,7 +411,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (strnotempty(r.cdispo)) { /* filename given */
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, r.cdispo);
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil)) {
|
||||
} else if (wire_patches_ext(opt, r.contenttype, fil,
|
||||
r.contenttype_given)) {
|
||||
if (give_mimext(s, sizeof(s),
|
||||
r.contenttype)) { // recognized extension
|
||||
ext_chg = 1;
|
||||
@@ -458,7 +458,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, headers->r.cdispo);
|
||||
} else if (wire_patches_ext(opt, headers->r.contenttype,
|
||||
headers->url_fil)) {
|
||||
headers->url_fil,
|
||||
headers->r.contenttype_given)) {
|
||||
char s[16];
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
@@ -674,8 +675,7 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
if (!has_been_moved) {
|
||||
if (back[b].r.statuscode != -10) { // erreur
|
||||
if (strnotempty(back[b].r.contenttype) == 0)
|
||||
strcpybuff(back[b].r.contenttype,
|
||||
HTS_UNKNOWN_MIME); // no declared type
|
||||
strcpybuff(back[b].r.contenttype, "text/html"); // message d'erreur en html
|
||||
// Finalement on, renvoie un erreur, pour ne toucher à rien dans le code
|
||||
// libérer emplacement backing
|
||||
}
|
||||
@@ -688,7 +688,8 @@ int url_savename(lien_adrfilsave *const afs,
|
||||
ext_chg = 2; /* change filename */
|
||||
strcpybuff(ext, back[b].r.cdispo);
|
||||
} else if (wire_patches_ext(opt, back[b].r.contenttype,
|
||||
back[b].url_fil)) {
|
||||
back[b].url_fil,
|
||||
back[b].r.contenttype_given)) {
|
||||
if (give_mimext(
|
||||
s, sizeof(s),
|
||||
back[b].r.contenttype)) { // recognized extension
|
||||
|
||||
12
src/htsopt.h
12
src/htsopt.h
@@ -428,11 +428,11 @@ struct httrackp {
|
||||
LLint maxfile_html; /**< max bytes per HTML file */
|
||||
int maxsoc; /**< max simultaneous sockets (-cN) */
|
||||
LLint fragment; /**< split site after this many bytes */
|
||||
hts_tristate
|
||||
hts_boolean
|
||||
nearlink; /**< also fetch images/data adjacent to a page but off-site */
|
||||
hts_boolean makeindex; /**< build a top-level index.html */
|
||||
hts_boolean kindex; /**< build a keyword index */
|
||||
hts_tristate delete_old; /**< delete locally obsolete files after update */
|
||||
hts_boolean delete_old; /**< delete locally obsolete files after update */
|
||||
int timeout; /**< connection timeout in seconds */
|
||||
int rateout; /**< minimum transfer rate (bytes/s) before abort */
|
||||
int maxtime; /**< max total mirror duration in seconds */
|
||||
@@ -465,13 +465,13 @@ struct httrackp {
|
||||
hts_boolean maketrack; /**< maintain an operations-statistics log */
|
||||
int parsejava; /**< Java/JS parsing mode; see htsparsejava_flags */
|
||||
int hostcontrol; /**< ban slow/timing-out hosts; see hts_hostcontrol bits */
|
||||
hts_tristate errpage; /**< generate an error page on 404 and similar */
|
||||
hts_boolean errpage; /**< generate an error page on 404 and similar */
|
||||
hts_boolean
|
||||
check_type; /**< probe unknown-type links (cgi/asp/dir) and follow moves
|
||||
*/
|
||||
hts_boolean all_in_cache; /**< keep all retrieved data in the cache */
|
||||
hts_robots robots; /**< robots.txt handling level */
|
||||
hts_tristate external; /**< render external links as error pages */
|
||||
hts_boolean external; /**< render external links as error pages */
|
||||
hts_boolean passprivacy; /**< strip passwords from external links */
|
||||
hts_boolean includequery; /**< include the query string in saved names */
|
||||
hts_boolean mirror_first_page; /**< only mirror the links of the first page */
|
||||
@@ -485,7 +485,7 @@ struct httrackp {
|
||||
hts_boolean sizehack; /**< treat same-size response as "updated" */
|
||||
hts_boolean urlhack; // force "url normalization" to avoid loops
|
||||
hts_boolean tolerant; /**< accept an incorrect Content-Length */
|
||||
hts_tristate
|
||||
hts_boolean
|
||||
parseall; /**< parse aggressively, including unknown tags with links */
|
||||
hts_boolean parsedebug; /**< parser debug mode */
|
||||
hts_boolean norecatch; /**< do not re-fetch files the user deleted locally */
|
||||
@@ -651,6 +651,8 @@ struct htsblk {
|
||||
int debugid; /**< connection debug id */
|
||||
/* */
|
||||
htsrequest req; /**< parameters used for the request */
|
||||
/* a Content-Type header was received (else contenttype holds a default) */
|
||||
hts_boolean contenttype_given;
|
||||
/*char digest[32+2]; // md5 digest generated by the engine ("" if none) */
|
||||
};
|
||||
|
||||
|
||||
@@ -1176,15 +1176,11 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
if (element != NULL) {
|
||||
msgCode = element->statuscode;
|
||||
StringRoom(headers, 8192);
|
||||
sprintf(StringBuffRW(headers),
|
||||
"HTTP/1.1 %d %s\r\n"
|
||||
sprintf(StringBuffRW(headers), "HTTP/1.1 %d %s\r\n"
|
||||
#ifndef NO_WEBDAV
|
||||
"%s"
|
||||
#endif
|
||||
"Content-Type: %s%s%s%s\r\n"
|
||||
"%s%s%s"
|
||||
"%s%s%s"
|
||||
"%s%s%s",
|
||||
"Content-Type: %s%s%s%s\r\n" "%s%s%s" "%s%s%s" "%s%s%s",
|
||||
/* */
|
||||
msgCode, element->msg,
|
||||
#ifndef NO_WEBDAV
|
||||
@@ -1192,18 +1188,16 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
StringBuff(davHeaders),
|
||||
#endif
|
||||
/* Content-type: foo; [ charset=bar ] */
|
||||
hts_effective_mime(element->contenttype),
|
||||
element->contenttype,
|
||||
((element->charset[0]) ? "; charset=\"" : ""),
|
||||
element->charset, ((element->charset[0]) ? "\"" : ""),
|
||||
/* location */
|
||||
((element->location != NULL && element->location[0])
|
||||
? "Location: "
|
||||
: ""),
|
||||
((element->location != NULL && element->location[0])
|
||||
? element->location
|
||||
: ""),
|
||||
((element->location != NULL && element->location[0]) ? "\r\n"
|
||||
: ""),
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? "Location: " : ""),
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? element->location : ""),
|
||||
((element->location != NULL
|
||||
&& element->location[0]) ? "\r\n" : ""),
|
||||
/* last-modified */
|
||||
((element->lastmodified[0]) ? "Last-Modified: " : ""),
|
||||
((element->lastmodified[0]) ? element->lastmodified : ""),
|
||||
@@ -1211,7 +1205,8 @@ static void proxytrack_process_HTTP(PT_Indexes indexes, T_SOC soc_c) {
|
||||
/* etag */
|
||||
((element->etag[0]) ? "ETag: " : ""),
|
||||
((element->etag[0]) ? element->etag : ""),
|
||||
((element->etag[0]) ? "\r\n" : ""));
|
||||
((element->etag[0]) ? "\r\n" : "")
|
||||
);
|
||||
StringLength(headers) = (int) strlen(StringBuff(headers));
|
||||
} else {
|
||||
/* No query string, no ending / : check the the <url>/ page */
|
||||
|
||||
@@ -52,7 +52,6 @@ Please visit our Website: http://www.httrack.com
|
||||
|
||||
#include "htscore.h"
|
||||
#include "htsback.h"
|
||||
#include "htslib.h" /* hts_effective_mime */
|
||||
|
||||
#include "store.h"
|
||||
#include "proxystrings.h"
|
||||
@@ -2290,17 +2289,10 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
||||
int size_headers;
|
||||
|
||||
sprintf(st->headers,
|
||||
"HTTP/1.0 %d %s"
|
||||
"\r\n"
|
||||
"X-Server: ProxyTrack " PROXYTRACK_VERSION "\r\n"
|
||||
"Content-type: %s%s%s%s"
|
||||
"\r\n"
|
||||
"Last-modified: %s"
|
||||
"\r\n"
|
||||
"Content-length: %d"
|
||||
"\r\n",
|
||||
element->statuscode, element->msg,
|
||||
/**/ hts_effective_mime(element->contenttype),
|
||||
"HTTP/1.0 %d %s" "\r\n" "X-Server: ProxyTrack " PROXYTRACK_VERSION
|
||||
"\r\n" "Content-type: %s%s%s%s" "\r\n" "Last-modified: %s" "\r\n"
|
||||
"Content-length: %d" "\r\n", element->statuscode, element->msg,
|
||||
/**/ element->contenttype,
|
||||
(element->charset[0] ? "; charset=\"" : ""),
|
||||
(element->charset[0] ? element->charset : ""),
|
||||
(element->charset[0] ? "\"" : ""), /**/ element->lastmodified,
|
||||
@@ -2336,10 +2328,10 @@ static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element)
|
||||
/* args */
|
||||
(link_has_authority(url) ? "" : "http://"), url, "0.0.0.0",
|
||||
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
|
||||
tm->tm_min, tm->tm_sec, hts_effective_mime(element->contenttype),
|
||||
element->statuscode, st->md5,
|
||||
(element->location ? element->location : "-"), (long int) ftell(fp),
|
||||
st->filename, (long int) (size_headers + element->size));
|
||||
tm->tm_min, tm->tm_sec, element->contenttype, element->statuscode,
|
||||
st->md5, (element->location ? element->location : "-"),
|
||||
(long int) ftell(fp), st->filename,
|
||||
(long int) (size_headers + element->size));
|
||||
/* network_doc */
|
||||
if (fwrite(st->headers, 1, size_headers, fp) != size_headers
|
||||
|| (element->size > 0
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/notype.png' \
|
||||
--found 'types/notype.pdf' --not-found 'types/notype.html' \
|
||||
--found 'types/photo.png' \
|
||||
--found 'types/doc.pdf' \
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# A second (update) pass must keep the names the first crawl chose. The stored
|
||||
# Content-Type rides the cache, so the update reads back the same value -- the
|
||||
# unknown/unknown sentinel for a typeless response, the declared type otherwise
|
||||
# -- and names consistently: a declared-text/html .pdf stays .html and a
|
||||
# typeless .png stays .png across the update rather than reverting.
|
||||
|
||||
: "${top_srcdir:=..}"
|
||||
|
||||
bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
|
||||
--found 'types/report.html' --not-found 'types/report.pdf' \
|
||||
--found 'types/notype.png' --not-found 'types/notype.html' \
|
||||
--found 'types/lie.html' \
|
||||
httrack 'BASEURL/types/index.html'
|
||||
@@ -54,7 +54,6 @@ TESTS = \
|
||||
14_local-https.test \
|
||||
15_local-types.test \
|
||||
16_local-assume.test \
|
||||
17_local-empty-ct.test \
|
||||
18_local-update.test
|
||||
17_local-empty-ct.test
|
||||
|
||||
CLEANFILES = check-network_sh.cache
|
||||
|
||||
@@ -26,7 +26,6 @@ key="${testdir}/server.key"
|
||||
|
||||
tls=
|
||||
verbose=
|
||||
rerun=
|
||||
tmpdir=
|
||||
serverpid=
|
||||
crawlpid=
|
||||
@@ -90,7 +89,6 @@ nargs=$#
|
||||
while test "$pos" -lt "$nargs"; do
|
||||
case "${args[$pos]}" in
|
||||
--debug) verbose=1 ;;
|
||||
--rerun) rerun=1 ;; # run httrack a second time (update pass) before auditing
|
||||
--no-purge)
|
||||
nopurge=1
|
||||
audit+=("--no-purge")
|
||||
@@ -182,22 +180,6 @@ test "$crawlres" -eq 0 || ! result "httrack exited $crawlres" || {
|
||||
result "OK"
|
||||
grep -iE "^[0-9:]*[[:space:]]Error:" "${out}/hts-log.txt" >&2
|
||||
|
||||
# --- optional second pass: re-mirror into the same dir (cache/update path) ----
|
||||
if test -n "$rerun"; then
|
||||
info "re-running httrack (update pass)"
|
||||
httrack -O "$out" --user-agent="httrack $ver local ($(uname -omrs))" \
|
||||
"${moreargs[@]}" "${hts[@]}" >"${log}.2" 2>&1 &
|
||||
crawlpid=$!
|
||||
wait "$crawlpid"
|
||||
crawlres=$?
|
||||
crawlpid=
|
||||
test "$crawlres" -eq 0 || ! result "update pass exited $crawlres" || {
|
||||
cat "${log}.2" >&2
|
||||
exit 1
|
||||
}
|
||||
result "OK (update)"
|
||||
fi
|
||||
|
||||
# --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
|
||||
hostroot=
|
||||
for cand in "${out}/127.0.0.1_${port}" "${out}/127.0.0.1"; do
|
||||
|
||||
Reference in New Issue
Block a user