Compare commits

..

1 Commits

Author SHA1 Message Date
Xavier Roche
aecbaa9993 Release 3.49.10
Bump the package version to 3.49.10 and curate the release notes.

VERSION_INFO goes 3:1:0 -> 3:2:0: the cycle only appended tail fields to the
installed options struct (--cookies-file, --pause, --strip-query, the -%u
split), no existing symbol or offset changed, so the soname stays .so.3.

history.txt gets the 3.49-10 block; debian/changelog gets 3.49.10-1 with the
Debian-specific items (DEP-5 copyright, chromium-first browser dep, minizip
embedded-library override). Standards-Version 4.7.0 -> 4.7.4: the intervening
Policy changes (usr-merge, /usr/games, Priority recommendation, -dev linker
scripts, non-free-firmware) need no package change.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-28 14:21:37 +02:00
5 changed files with 1562 additions and 12264 deletions

View File

@@ -3,12 +3,12 @@
# Change this to download files
if false; then
echo "mget https://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
echo "mget https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/EBCDIC/CP*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/CP*.TXT" | lftp
echo "mget ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8*.TXT" | lftp
rm -f CP932.TXT CP936.TXT CP949.TXT CP950.TXT
fi

View File

@@ -30,14 +30,12 @@ Please visit our Website: http://www.httrack.com
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
#include <stdint.h>
#include "htscharset.h"
#include "htsencoding.h"
#include "htssafe.h"
/* static int decode_entity(const uint64_t hash, const size_t len);
*/
/* static int decode_entity(const unsigned int hash, const size_t len);
*/
#include "htsentities.h"
/* hexadecimal conversion */
@@ -52,31 +50,30 @@ static int get_hex_value(char c) {
return -1;
}
/* 64-bit FNV-1a; must match htsentities.sh, which keys the entity table on it.
*/
#define HASH_INIT 0xcbf29ce484222325ULL
#define HASH_PRIME 0x100000001b3ULL
#define HASH_ADD(HASH, C) \
do { \
(HASH) ^= (unsigned char) (C); \
(HASH) *= HASH_PRIME; \
} while (0)
/* Numerical Recipes,
see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */
#define HASH_PRIME ( 1664525 )
#define HASH_CONST ( 1013904223 )
#define HASH_ADD(HASH, C) do { \
(HASH) *= HASH_PRIME; \
(HASH) += HASH_CONST; \
(HASH) += (C); \
} while(0)
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
size_t i, j, ampStart, ampStartDest;
int uc;
int hex;
uint64_t hash;
unsigned int hash;
assertf(max != 0);
for (i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0,
hash = HASH_INIT;
src[i] != '\0'; i++) {
for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
/* start of entity */
if (src[i] == '&') {
ampStart = i;
ampStartDest = j;
hash = HASH_INIT;
hash = 0;
uc = -1;
}
/* inside a potential entity */
@@ -177,11 +174,14 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma
}
/* alphanumerical entity */
else {
/* alphanum, capped at the longest name
* '&CounterClockwiseContourIntegral;' (31) */
if (i <= ampStart + 31 && ((src[i] >= '0' && src[i] <= '9') ||
(src[i] >= 'A' && src[i] <= 'Z') ||
(src[i] >= 'a' && src[i] <= 'z'))) {
/* alphanum and not too far ('&thetasym;' is the longest) */
if (i <= ampStart + 10 &&
(
(src[i] >= '0' && src[i] <= '9')
|| (src[i] >= 'A' && src[i] <= 'Z')
|| (src[i] >= 'a' && src[i] <= 'z')
)
) {
/* compute hash */
HASH_ADD(hash, (unsigned char) src[i]);
} else {

File diff suppressed because it is too large Load Diff

View File

@@ -1,92 +1,75 @@
#!/bin/bash
#
# Regenerate htsentities.h from the WHATWG named character references.
set -euo pipefail
src=entities.json
url=https://html.spec.whatwg.org/entities.json
src=html40.txt
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
dest=htsentities.h
# 64-bit FNV-1a of $1, printed as a C constant. Must match the hash in
# htsencoding.c. The offset basis is stored as its wrapped (signed) bit pattern;
# bash arithmetic is 64-bit two's complement, so the result is bit-exact.
fnv1a() {
local s=$1 i c h=$((0xcbf29ce484222325))
for ((i = 0; i < ${#s}; i++)); do
printf -v c '%d' "'${s:i:1}"
h=$(((h ^ (c & 0xff)) * 0x100000001b3))
done
printf '0x%016xULL' "$h"
}
(
cat <<EOF
/*
-- ${dest} --
FILE GENERATED BY $0, DO NOT MODIFY
if [ ! -f "$src" ]; then
curl -fsS "$url" -o "$src"
fi
We compute the LCG hash
(see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
for each entity. We should in theory check using strncmp() that we
actually have the correct entity, but this is actually statistically
not needed.
# Keep ';'-terminated single-codepoint names; the ~93 multi-codepoint refs can't
# fit decode_entity's single-codepoint return and are skipped (left verbatim).
pairs=$(jq -r '
to_entries
| map(select((.key | endswith(";")) and (.value.codepoints | length == 1)))
| sort_by(.key)
| .[] | "\(.key | ltrimstr("&") | rtrimstr(";"))\t\(.value.codepoints[0])"' "$src")
We may want to do better, but we expect the hash function to be uniform, and
let the compiler be smart enough to optimize the switch (for example by
checking in log2() intervals)
This code has been generated using the evil $0 script.
*/
# Skipped multi-codepoint names, kept to prove none aliases an emitted hash.
skipped=$(jq -r '
to_entries
| map(select((.key | endswith(";")) and (.value.codepoints | length > 1)))
| .[] | .key | ltrimstr("&") | rtrimstr(";")' "$src")
cases=""
emit_hashes=""
while IFS=$'\t' read -r name cp; do
hash=$(fnv1a "$name")
cases+=" /* $name */"$'\n'
cases+=" case $hash:"$'\n'
cases+=" if (len == ${#name}) {"$'\n'
cases+=" return $cp;"$'\n'
cases+=" }"$'\n'
cases+=" break;"$'\n'
emit_hashes+="$hash"$'\n'
done <<<"$pairs"
skip_hashes=""
while IFS= read -r name; do
[ -n "$name" ] && skip_hashes+="$(fnv1a "$name")"$'\n'
done <<<"$skipped"
# The switch keys on the hash alone, so the dispatch is correct only while every
# emitted name hashes uniquely; prove it here, no runtime name compare needed.
dups=$(printf '%s' "$emit_hashes" | sort | uniq -d || true)
if [ -n "$dups" ]; then
echo "FATAL: two entity names share a hash (duplicate switch case); change the hash:" >&2
echo "$dups" >&2
exit 1
fi
# A skipped name colliding with an emitted hash would mis-decode instead of
# staying verbatim; forbid that too.
aliased=$(comm -12 <(printf '%s' "$emit_hashes" | sort -u) <(printf '%s' "$skip_hashes" | sort -u) || true)
if [ -n "$aliased" ]; then
echo "FATAL: a skipped multi-codepoint name aliases an emitted hash:" >&2
echo "$aliased" >&2
exit 1
fi
cat >"$dest" <<EOF
/* GENERATED by $0 from the WHATWG named character references
(${url}). DO NOT EDIT.
Dispatch keys on a 64-bit FNV-1a hash of the entity name; the generator
aborts on any hash collision, so no runtime name compare is needed. */
#include <stdint.h>
static int decode_entity(const uint64_t hash, const size_t len) {
static int decode_entity(const unsigned int hash, const size_t len) {
switch(hash) {
${cases} }
EOF
(
if test -f ${src}; then
cat ${src}
else
GET "${url}"
fi
) |
grep -E '^<!ENTITY [a-zA-Z0-9_]' |
sed \
-e 's/<!ENTITY //' -e "s/[[:space:]][[:space:]]*/ /g" \
-e 's/-->$//' \
-e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/' |
(
read -r A
while test -n "$A"; do
ent="${A%% *}"
code=$(echo "$A" | cut -f2 -d' ')
# compute hash
hash=0
i=0
a=1664525
c=1013904223
m="$((1 << 32))"
while test "$i" -lt ${#ent}; do
d="$(echo -n "${ent:${i}:1}" | hexdump -v -e '/1 "%d"')"
hash="$((((hash * a) % (m) + d + c) % (m)))"
i=$((i + 1))
done
echo -e " /* $A */"
echo -e " case ${hash}u:"
echo -e " if (len == ${#ent} /* && strncmp(ent, \"${ent}\") == 0 */) {"
echo -e " return ${code};"
echo -e " }"
echo -e " break;"
# next
read -r A
done
)
cat <<EOF
}
/* unknown */
return -1;
}
EOF
echo "wrote $dest ($(grep -c '^ case ' "$dest") entities)" >&2
) >${dest}

View File

@@ -18,21 +18,6 @@ ent '&amp;' '&'
ent '&lt;&gt;' '<>'
ent '&eacute;' 'é'
# HTML5 names from the WHATWG set
ent '&hellip;' '…'
ent '&bigcup;' ''
# longest name (31 chars) exercises the name-length cap
ent '&CounterClockwiseContourIntegral;' '∳'
# astral codepoint -> 4-byte UTF-8
ent '&Aopf;' '𝔸'
# multi-codepoint refs are skipped at generation, so left verbatim
ent '&fjlig;' '&fjlig;'
# common HTML4 names still decode (regression guard against accidental drops)
ent '&copy;&reg;&trade;' '©®™'
ent '&mdash;&ndash;' '—–'
ent '&alpha;&beta;' 'αβ'
# numeric: decimal and hex
ent '&#65;&#66;' 'AB'
ent '&#x41;' 'A'