Compare commits

...

1 Commits

Author SHA1 Message Date
Xavier Roche
e2e6a4d4e4 Advertise deflate in Accept-Encoding and decode it
The request Accept-Encoding offered only gzip even though the response
parser already recognized deflate/x-deflate. But the actual decode path
(hts_zunpack) used zlib's gzread, which only inflates gzip and copies any
deflate body through verbatim, so a deflate response would have been
written out still compressed. Advertising deflate without fixing that
would corrupt files.

Rewrite hts_zunpack to inflate via inflateInit2 with format detection:
gzip and zlib (RFC1950) auto-detect with +32 windowBits, everything else
is treated as raw deflate (RFC1951). Since a raw deflate stream can pass
the zlib header check by accident, a failed decode is retried once with
the other windowBits. Then add deflate to the advertised list through a
small hts_acceptencoding() helper shared with the test.

A new -#test=acceptencoding self-test asserts the advertised header
carries both gzip and deflate, and round-trips gzip, zlib and raw-deflate
bodies through hts_zunpack on disk. Both halves fail on the old binary.

Brotli is intentionally out of scope (new dependency, larger change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-29 08:41:16 +02:00
6 changed files with 253 additions and 39 deletions

View File

@@ -1326,16 +1326,12 @@ int http_sendhead(httrackp * opt, t_cookie * cookie, int mode,
// Compression accepted ?
if (retour->req.http11) {
hts_boolean compressible = HTS_FALSE;
#if HTS_USEZLIB
if ((!retour->req.range_used)
&& (!retour->req.nocompression))
print_buffer(&bstr, "Accept-Encoding: " "gzip" /* gzip if the preffered encoding */
", " "identity;q=0.9" H_CRLF);
else
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
#else
print_buffer(&bstr, "Accept-Encoding: identity" H_CRLF); /* no compression */
compressible = (!retour->req.range_used && !retour->req.nocompression);
#endif
print_buffer(&bstr, "Accept-Encoding: %s" H_CRLF,
hts_acceptencoding(compressible));
}
/* Authentification */
@@ -4414,6 +4410,11 @@ HTSEXT_API void get_httptype(httrackp *opt, char *s, const char *fil,
(void) get_httptype_sized(opt, s, HTS_MIMETYPE_SIZE, fil, flag);
}
/* Value to advertise in the Accept-Encoding request header (value only:
   no header name, no CRLF).
   compressible: whether compressed responses may be requested.
   Returns a static string: the gzip/deflate list (with identity as a
   lower-preference fallback) when compressible, plain "identity" otherwise.
   gzip and deflate are both decoded by hts_zunpack. */
const char *hts_acceptencoding(hts_boolean compressible) {
  if (!compressible) {
    return "identity";
  }
  return "gzip, deflate, identity;q=0.9";
}
// get type of fil (php)
// s: buffer (text/html) or NULL
// return: 1 if known by user

View File

@@ -285,6 +285,9 @@ int ishttperror(int err);
int get_userhttptype(httrackp * opt, char *s, const char *fil);
int give_mimext(char *s, size_t ssize, const char *st);
/* Advertised Accept-Encoding value (no header name/CRLF); see htslib.c. */
const char *hts_acceptencoding(hts_boolean compressible);
int may_bogus_multiple(httrackp * opt, const char *mime, const char *filename);
int may_unknown2(httrackp * opt, const char *mime, const char *filename);

View File

@@ -50,6 +50,9 @@ Please visit our Website: http://www.httrack.com
#include "htscharset.h"
#include "htsencoding.h"
#include "htsmd5.h"
#if HTS_USEZLIB
#include "htszlib.h"
#endif
#include "coucal/coucal.h"
#include <ctype.h>
@@ -1336,6 +1339,158 @@ static int st_status(httrackp *opt, int argc, char **argv) {
return 0;
}
#if HTS_USEZLIB
/* Deflate src->path at windowBits (16+ gzip, + zlib, - raw); 0 on success. */
static int ae_write_packed(const char *path, int windowBits,
const unsigned char *src, size_t len) {
unsigned char out[8192];
z_stream strm;
FILE *f;
int zerr;
memset(&strm, 0, sizeof(strm));
if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowBits, 8,
Z_DEFAULT_STRATEGY) != Z_OK)
return 1;
if ((f = FOPEN(path, "wb")) == NULL) {
deflateEnd(&strm);
return 1;
}
strm.next_in = (Bytef *) src;
strm.avail_in = (uInt) len;
do {
size_t n;
strm.next_out = out;
strm.avail_out = sizeof(out);
zerr = deflate(&strm, Z_FINISH);
n = sizeof(out) - strm.avail_out;
if (n > 0 && fwrite(out, 1, n, f) != n) {
deflateEnd(&strm);
fclose(f);
return 1;
}
} while (zerr == Z_OK);
deflateEnd(&strm);
fclose(f);
return (zerr == Z_STREAM_END) ? 0 : 1;
}
/* Append a stored-block (BTYPE=00) header at buf[p]: the given first byte,
   then the 16-bit length n and its one's complement, low byte first.
   Returns the offset just past the header. */
static size_t ae_put_stored_header(unsigned char *buf, size_t p,
                                   unsigned char first, size_t n) {
  buf[p++] = first;
  buf[p++] = (unsigned char) (n & 0xff);
  buf[p++] = (unsigned char) ((n >> 8) & 0xff);
  buf[p++] = (unsigned char) (~n & 0xff);
  buf[p++] = (unsigned char) ((~n >> 8) & 0xff);
  return p;
}

/* Write src[0..len) to path as a forged raw deflate stream whose first two
   bytes (08 1D) pass for a zlib header, so only the raw-deflate fallback can
   decode it. The payload is split into two stored blocks: 29 bytes, then
   the remainder.
   Returns 0 on success; 1 if len is below 29, the remainder exceeds 0xFFFF
   bytes, allocation fails, or the file cannot be fully written and closed. */
static int ae_write_collision(const char *path, const unsigned char *src,
                              size_t len) {
  /* block-1 LEN low byte 0x1D: with 0x08, (0x081D)%31==0 */
  const size_t n1 = 29;
  size_t n2, p;
  unsigned char *buf;
  FILE *f;
  int ok;

  if (len < n1 || len - n1 > 0xFFFF)
    return 1;
  n2 = len - n1;
  /* two 5-byte stored-block headers plus the payload */
  buf = malloct(10 + len);
  if (buf == NULL)
    return 1;

  /* 0x08: BFINAL=0, BTYPE=00, forged padding -> zlib CMF nibble */
  p = ae_put_stored_header(buf, 0, 0x08, n1);
  memcpy(buf + p, src, n1);
  p += n1;
  /* 0x01: BFINAL=1, BTYPE=00 */
  p = ae_put_stored_header(buf, p, 0x01, n2);
  memcpy(buf + p, src + n1, n2);
  p += n2;

  f = FOPEN(path, "wb");
  ok = (f != NULL && fwrite(buf, 1, p, f) == p);
  /* fclose flushes buffered data: treat a failed close as a failed write */
  if (f != NULL && fclose(f) != 0)
    ok = 0;
  freet(buf);
  return ok ? 0 : 1;
}
/* Check that the file at path holds exactly expect[0..len).
   The file is read in fixed-size chunks, so large files are never held in
   memory at once.
   Returns 0 when the contents match byte for byte, 1 when the file cannot be
   opened, differs, or is shorter or longer than len. */
static int ae_check_decoded(const char *path, const unsigned char *expect,
                            size_t len) {
  unsigned char chunk[8192];
  size_t done = 0;
  int mismatch = 0;
  FILE *fp = FOPEN(path, "rb");

  if (fp == NULL)
    return 1;

  for (;;) {
    const size_t got = fread(chunk, 1, sizeof(chunk), fp);

    if (got == 0)
      break;
    /* more data than expected, or different data: stop at once */
    if (got > len - done || memcmp(chunk, expect + done, got) != 0) {
      mismatch = 1;
      break;
    }
    done += got;
  }
  fclose(fp);

  return (!mismatch && done == len) ? 0 : 1;
}
#endif
/* Accept-Encoding self-test (#450).
   Always checks the advertised values: "identity" when compression is off,
   and a list naming both gzip and deflate when it is on.
   With zlib support and at least one argument, argv[0] is used as the
   directory for scratch files: a small and a 64 KiB body are packed as gzip,
   zlib and raw deflate, unpacked with hts_zunpack and compared with the
   source, followed by a forged raw stream that looks like zlib.
   Returns 0; any failed check goes through assertf. */
static int st_acceptencoding(httrackp *opt, int argc, char **argv) {
  const char *plain = hts_acceptencoding(HTS_FALSE);
  const char *packed = hts_acceptencoding(HTS_TRUE);

  (void) opt;
  assertf(strcmp(plain, "identity") == 0);
  assertf(strstr(packed, "gzip") != NULL);
  assertf(strstr(packed, "deflate") != NULL); /* fails on the old gzip-only list */
#if HTS_USEZLIB
  if (argc >= 1) {
    static const int windowBits[] = {16 + MAX_WBITS, MAX_WBITS, -MAX_WBITS};
    const size_t nformats = sizeof(windowBits) / sizeof(windowBits[0]);
    const unsigned char small[] =
      "deflate round-trip: HTTrack decodes gzip and deflate alike. "
      "deflate round-trip: HTTrack decodes gzip and deflate alike.";
    /* 64 KiB of varied (LCG) bytes: forces the multi-fread loop */
    const size_t blen = 64 * 1024;
    struct {
      const unsigned char *data;
      size_t len;
    } payload[2];
    unsigned char *body = malloct(blen);
    uint32_t seed = 0x1234567u;
    char inpath[HTS_URLMAXSIZE], outpath[HTS_URLMAXSIZE];
    size_t w, k;

    assertf(body != NULL);
    for (k = 0; k < blen; k++) {
      seed = seed * 1103515245u + 12345u;
      body[k] = (unsigned char) (seed >> 16);
    }
    payload[0].data = small;
    payload[0].len = sizeof(small) - 1;
    payload[1].data = body;
    payload[1].len = blen;

    /* gzip, zlib (RFC1950) and raw deflate (RFC1951), both small and large. */
    for (w = 0; w < nformats; w++) {
      const int bits = windowBits[w];

      snprintf(inpath, sizeof(inpath), "%s/ae-in-%d.z", argv[0], bits);
      snprintf(outpath, sizeof(outpath), "%s/ae-out-%d", argv[0], bits);
      for (k = 0; k < sizeof(payload) / sizeof(payload[0]); k++) {
        assertf(ae_write_packed(inpath, bits, payload[k].data,
                                payload[k].len) == 0);
        assertf(hts_zunpack(inpath, outpath) == (int) payload[k].len);
        assertf(ae_check_decoded(outpath, payload[k].data,
                                 payload[k].len) == 0);
      }
    }

    /* Fallback teeth: raw deflate misdetected as zlib; -1 without the retry. */
    snprintf(inpath, sizeof(inpath), "%s/ae-collide.z", argv[0]);
    snprintf(outpath, sizeof(outpath), "%s/ae-collide.out", argv[0]);
    assertf(ae_write_collision(inpath, body, 64) == 0);
    assertf(hts_zunpack(inpath, outpath) == 64);
    assertf(ae_check_decoded(outpath, body, 64) == 0);

    freet(body);
  }
#else
  (void) argc;
  (void) argv;
#endif
  printf("acceptencoding self-test OK: %s\n", packed);
  return 0;
}
/* ------------------------------------------------------------ */
/* Registry: name -> handler, with a usage hint and a one-line description. */
/* ------------------------------------------------------------ */
@@ -1384,6 +1539,8 @@ static const struct selftest_entry {
{"cookies", "", "cookie request-header self-test", st_cookies},
{"useragent", "", "default User-Agent self-test", st_useragent},
{"status", "", "HTTP status code -> reason phrase self-test", st_status},
{"acceptencoding", "[dir]",
"Accept-Encoding advertises gzip+deflate, both decode", st_acceptencoding},
};
static void list_selftests(void) {

View File

@@ -47,48 +47,89 @@ Please visit our Website: http://www.httrack.com
*/
/*
Unpack file into a new file
Unpack file into a new file (gzip, zlib RFC1950 or raw deflate RFC1951).
Return value: size of the new file, or -1 if an error occurred
*/
/* Note: utf-8 */
int hts_zunpack(char *filename, char *newfile) {
int ret = -1;
if (filename != NULL && newfile != NULL) {
if (filename[0] && newfile[0]) {
char catbuff[CATBUFF_SIZE];
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
const int fd = in != NULL ? fileno(in) : -1;
const int dup_fd = fd != -1 ? dup(fd) : -1;
// Note: we must dup to be able to flose cleanly.
const gzFile gz = dup_fd != -1 ? gzdopen(dup_fd, "rb") : NULL;
if (filename != NULL && newfile != NULL && filename[0] && newfile[0]) {
char catbuff[CATBUFF_SIZE];
FILE *const in = FOPEN(fconv(catbuff, sizeof(catbuff), filename), "rb");
if (gz) {
FILE *const fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
int size = 0;
if (in != NULL) {
unsigned char BIGSTK inbuf[8192];
size_t navail = fread(inbuf, 1, sizeof(inbuf), in);
/* gzip/zlib headers -> +32 windowBits; else raw deflate (RFC1951) */
const hts_boolean wrapped =
(navail >= 2 &&
((inbuf[0] == 0x1f && inbuf[1] == 0x8b) ||
((inbuf[0] & 0x0f) == Z_DEFLATED &&
(((unsigned) inbuf[0] << 8 | inbuf[1]) % 31) == 0)));
int attempt;
if (fpout) {
int nr;
/* deflate is ambiguous; on failure retry with the other windowBits */
for (attempt = 0; attempt < 2 && ret < 0; attempt++) {
const int windowBits =
(attempt == 0 ? wrapped : !wrapped) ? (32 + MAX_WBITS) : -MAX_WBITS;
FILE *fpout;
z_stream strm;
do {
char BIGSTK buff[1024];
nr = gzread(gz, buff, sizeof(buff));
if (nr > 0) {
size += nr;
if (fwrite(buff, 1, nr, fpout) != nr)
nr = size = -1;
}
} while(nr > 0);
if (attempt > 0) {
/* rewind input; reopening fpout "wb" discards the partial output */
if (fseek(in, 0, SEEK_SET) != 0)
break;
navail = fread(inbuf, 1, sizeof(inbuf), in);
}
fpout = FOPEN(fconv(catbuff, sizeof(catbuff), newfile), "wb");
if (fpout == NULL)
break;
memset(&strm, 0, sizeof(strm));
if (inflateInit2(&strm, windowBits) != Z_OK) {
fclose(fpout);
} else
size = -1;
gzclose(gz);
ret = (int) size;
}
if (in != NULL) {
fclose(in);
break;
}
{
hts_boolean ok = HTS_TRUE;
int size = 0;
int zerr = Z_OK;
/* chunked inflate; first chunk in inbuf, single member */
do {
strm.next_in = inbuf;
strm.avail_in = (uInt) navail;
do {
unsigned char BIGSTK outbuf[8192];
size_t produced;
strm.next_out = outbuf;
strm.avail_out = sizeof(outbuf);
zerr = inflate(&strm, Z_NO_FLUSH);
if (zerr == Z_NEED_DICT || zerr == Z_DATA_ERROR ||
zerr == Z_MEM_ERROR || zerr == Z_STREAM_ERROR) {
ok = HTS_FALSE;
break;
}
produced = sizeof(outbuf) - strm.avail_out;
if (produced > 0 &&
fwrite(outbuf, 1, produced, fpout) != produced) {
ok = HTS_FALSE;
break;
}
size += (int) produced;
} while (strm.avail_out == 0);
if (!ok || zerr == Z_STREAM_END)
break;
navail = fread(inbuf, 1, sizeof(inbuf), in);
} while (navail > 0);
if (ok && zerr == Z_STREAM_END)
ret = size;
}
inflateEnd(&strm);
fclose(fpout);
}
fclose(in);
}
}
return ret;

View File

@@ -0,0 +1,11 @@
#!/bin/bash
#
# Accept-Encoding (#450): advertise gzip+deflate; decode gzip/zlib/raw-deflate.
# Runs the built-in acceptencoding self-test against a throwaway directory and
# passes only if the success line is printed.

set -euo pipefail

scratch=$(mktemp -d)

# Remove the scratch directory however the script exits.
cleanup() {
    rm -rf "$scratch"
}
trap cleanup EXIT

httrack -O /dev/null -#test=acceptencoding "$scratch" run \
    | grep -q "acceptencoding self-test OK"

View File

@@ -49,6 +49,7 @@ TESTS = \
01_engine-strsafe.test \
01_engine-urlhack.test \
01_engine-useragent.test \
01_zlib-acceptencoding.test \
01_zlib-cache.test \
01_zlib-cache-golden.test \
01_zlib-cache-writefail.test \