Files
httrack/src/htsencoding.c
Xavier Roche cca83e5f4a Modernize HTML entity decoding to the WHATWG named character references (#444)
* Modernize HTML entity decoding to the WHATWG named character references

Regenerate htsentities.h from the WHATWG entities.json (2032 single-codepoint
names) instead of the 1998 HTML 4.0 set (252 names). The dispatch hash moves
from a 32-bit LCG to 64-bit FNV-1a; the generator now aborts on any (hash,len)
collision, so the hash-only switch stays correct without a runtime name compare.
Bump the consumer name-length cap from 10 to 31, the longest name
(CounterClockwiseContourIntegral), or long names would be rejected outright.
Multi-codepoint references (~93 obscure math entities) can't fit the
single-codepoint return and are skipped, left verbatim as before.

Also fix the dead ftp://ftp.unicode.org URLs in htsbasiccharsets.sh.

Closes #443

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>

* entities: harden the generator collision guard and widen test coverage

Review follow-up. The switch keys on the hash alone, so check hash-alone
uniqueness among emitted names (a same-hash/different-len pair would otherwise
slip the old (hash,len) check and surface only as a cryptic duplicate-case
compile error). Also hash the ~93 skipped multi-codepoint names and abort if any
aliases an emitted hash, so "skipped means verbatim" is enforced rather than
assumed on future regens.

Add a runtime sweep of common HTML4 names (copy/reg/trade/mdash/ndash/alpha/beta)
to 01_engine-entities.test: a regression guard against accidental drops and a
generator-vs-consumer hash cross-check on names beyond the handful already
probed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>

---------

Signed-off-by: Xavier Roche <roche@httrack.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 15:29:03 +02:00

333 lines
9.7 KiB
C

/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 2013 Xavier Roche and other contributors
SPDX-License-Identifier: GPL-3.0-or-later
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Ethical use: we kindly ask that you NOT use this software to harvest email
addresses or to collect any other private information about people. Doing so
would dishonor our work and waste the many hours we have spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: Encoding conversion functions */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
#include <stdint.h>
#include "htscharset.h"
#include "htsencoding.h"
#include "htssafe.h"
/* static int decode_entity(const uint64_t hash, const size_t len);
*/
#include "htsentities.h"
/* Map one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its value 0..15.
   Any other character yields -1. */
static int get_hex_value(char c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }
  if ('a' <= c && c <= 'f') {
    return 10 + (c - 'a');
  }
  if ('A' <= c && c <= 'F') {
    return 10 + (c - 'A');
  }
  /* not a hexadecimal digit */
  return -1;
}
/* 64-bit FNV-1a; must match htsentities.sh, which keys the entity table on it.
   HASH_INIT is the value the accumulator starts from and HASH_PRIME the
   multiplier applied after each byte.
   HASH_ADD(HASH, C) folds one byte C into the accumulator HASH: the byte
   (converted to unsigned char) is XORed in first, then the accumulator is
   multiplied by HASH_PRIME. HASH must be a modifiable lvalue; it is evaluated
   twice, so it must not carry side effects.
*/
#define HASH_INIT 0xcbf29ce484222325ULL
#define HASH_PRIME 0x100000001b3ULL
#define HASH_ADD(HASH, C) \
do { \
(HASH) ^= (unsigned char) (C); \
(HASH) *= HASH_PRIME; \
} while (0)
/*
 * Copy 'src' to 'dest', replacing HTML character references by the
 * corresponding character encoded in 'charset'.
 *
 * Recognized forms are '&name;' (looked up through decode_entity() using the
 * FNV-1a hash of the name and its length), '&#NNN;' (decimal) and '&#xHHH;'
 * (hexadecimal). Anything that does not decode is left verbatim.
 *
 * src:     NUL-terminated input string.
 * dest:    output buffer; may be the same pointer as 'src' (bytes are only
 *          stored when the read and write positions differ).
 * max:     capacity of 'dest' in bytes, terminating NUL included; must not
 *          be 0.
 * charset: when hts_isCharsetUTF8() accepts it, the character is written
 *          directly as UTF-8; otherwise the UTF-8 form is converted through
 *          hts_convertStringFromUTF8().
 *
 * Returns 0 upon success, -1 if the output (terminating NUL included) does
 * not fit in 'max' bytes. 'dest' is not NUL-terminated in the latter case.
 */
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
  /* i: read position in src; j: write position in dest.
     ampStart/ampStartDest: positions of the pending '&' in src/dest, with
     ampStart == (size_t) -1 meaning "not inside a potential entity". */
  size_t i, j, ampStart, ampStartDest;
  /* -1 while the entity is (possibly) a named one, otherwise the numerical
     value accumulated so far */
  int uc;
  /* non-zero for a '&#x..;' entity */
  int hex;
  /* running hash of the entity name */
  uint64_t hash;

  assertf(max != 0);
  for (i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0,
      hash = HASH_INIT;
       src[i] != '\0'; i++) {
    /* start of entity */
    if (src[i] == '&') {
      ampStart = i;
      ampStartDest = j;
      hash = HASH_INIT;
      uc = -1;
    }
    /* inside a potential entity */
    else if (ampStart != (size_t) -1) {
      /* &#..; entity */
      if (ampStart + 1 == i && src[ampStart + 1] == '#') {
        uc = 0;
        hex = 0;
      }
      /* &#x..; entity */
      else if (ampStart + 2 == i && src[ampStart + 1] == '#'
               && src[ampStart + 2] == 'x') {
        hex = 1;
      }
      /* end of entity */
      else if (src[i] == ';') {
        size_t len;

        /* decode entity */
        if (uc == -1) {
          /* &foo; */
          uc = decode_entity(hash, /*&src[ampStart + 1],*/
                             i - ampStart - 1);
          /* FIXME: TEMPORARY HACK FROM PREVIOUS VERSION TO BE INVESTIGATED */
          /* (code point 160 is replaced by 32, a plain space) */
          if (uc == 160) {
            uc = 32;
          }
        }
        /* end */
        ampStart = (size_t) -1;
        /* success ? */
        if (uc > 0) {
          /* room left in dest, counted from the position of the '&' */
          const size_t maxOut = max - ampStartDest;

          /* write at position */
          if (charset != NULL && hts_isCharsetUTF8(charset)) {
            len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
          } else {
            size_t ulen;
            char buffer[32];

            len = 0;
            if ((ulen = hts_writeUTF8(uc, buffer, sizeof(buffer))) != 0) {
              char *s;

              buffer[ulen] = '\0';
              s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset);
              if (s != NULL) {
                const size_t sLen = strlen(s);

                /* strictly smaller: keep one byte for the terminating NUL */
                if (sLen < maxOut) {
                  /* Do not copy \0. */
                  memcpy(&dest[ampStartDest], s, sLen);
                  len = sLen;
                }
                free(s);
              }
            }
          }
          if (len > 0) {
            /* new dest position: the decoded character replaces the
               verbatim '&...' text copied so far */
            j = ampStartDest + len;
            /* do not copy ; */
            continue;
          }
        }
      }
      /* numerical entity */
      else if (uc != -1) {
        /* decimal */
        if (!hex) {
          if (src[i] >= '0' && src[i] <= '9') {
            const int h = src[i] - '0';

            /* Guard before multiplying: a codepoint past the Unicode max
               (0x10FFFF) is invalid anyway, so stop rather than overflow uc. */
            if (uc > (0x10FFFF - h) / 10) {
              ampStart = (size_t) -1;
            } else {
              uc = uc * 10 + h;
            }
          } else {
            /* abandon */
            ampStart = (size_t) -1;
          }
        }
        /* hex */
        else {
          const int h = get_hex_value(src[i]);

          if (h != -1) {
            /* same overflow guard as the decimal case */
            if (uc > (0x10FFFF - h) / 16) {
              ampStart = (size_t) -1;
            } else {
              uc = uc * 16 + h;
            }
          } else {
            /* abandon */
            ampStart = (size_t) -1;
          }
        }
      }
      /* alphanumerical entity */
      else {
        /* alphanum, capped at the longest name
         * '&CounterClockwiseContourIntegral;' (31) */
        if (i <= ampStart + 31 && ((src[i] >= '0' && src[i] <= '9') ||
                                   (src[i] >= 'A' && src[i] <= 'Z') ||
                                   (src[i] >= 'a' && src[i] <= 'z'))) {
          /* compute hash */
          HASH_ADD(hash, (unsigned char) src[i]);
        } else {
          /* abandon */
          ampStart = (size_t) -1;
        }
      }
    }

    /* copy: the byte goes to dest[j] and the terminating NUL will need
       dest[j + 1] at the latest, so both must lie within the 'max' bytes */
    if (j + 1 >= max) {
      /* overflow */
      return -1;
    }
    /* nothing to store when decoding in place and nothing shrank so far */
    if (src != dest || i != j) {
      dest[j] = src[i];
    }
    j++;
  }
  /* j <= max - 1 here, so the terminator stays inside the buffer */
  dest[j] = '\0';

  return 0;
}
/* Convenience wrapper: same as hts_unescapeEntitiesWithCharset() with the
   charset fixed to "UTF-8". The result of that call is returned unchanged. */
int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
  const int status = hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");

  return status;
}
/*
 * Copy 'src' to 'dest', decoding '%xx' URL escapes where appropriate.
 *
 * - A '%xx' escape that yields a printable ASCII character (32..127) is
 *   replaced by that character, unless UNESCAPE_URL_NO_ASCII is set in
 *   'flags'.
 * - Bytes >= 0x80, raw or given as '%xx', are gathered into a UTF-8 sequence;
 *   once the sequence is complete and hts_readUTF8() accepts it, the
 *   sequence is written as raw bytes in place of its escaped form.
 *   Everything else is left as it was in 'src'.
 * - After the first '?', a '+' is replaced by a space.
 *
 * src:   NUL-terminated input string; must not be the same pointer as 'dest'.
 * dest:  output buffer.
 * max:   capacity of 'dest' in bytes, terminating NUL included; must not
 *        be 0.
 * flags: bit mask; only UNESCAPE_URL_NO_ASCII is tested here.
 *
 * Returns 0 upon success, -1 if the output (terminating NUL included) does
 * not fit in 'max' bytes. 'dest' is not NUL-terminated in the latter case.
 */
int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
                           const int flags) {
  /* i: read position in src; j: write position in dest.
     lastI/lastJ: positions in src/dest of the last '%' seen, (size_t) -1
     when none was seen yet.
     k: number of bytes of the pending UTF-8 sequence held in utfBuffer.
     utfBufferJ: position in dest where the pending sequence started.
     utfBufferSize: expected length of the pending sequence. */
  size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
  int seenQuery = 0;
  char utfBuffer[32];

  assertf(src != dest);
  assertf(max != 0);
  for (i = 0, j = 0, k = 0, utfBufferJ = 0, utfBufferSize = 0,
      lastI = (size_t) -1, lastJ = (size_t) -1;
       src[i] != '\0'; i++) {
    /* c is what gets copied to dest; cUtf is the decoded byte value used to
       drive the UTF-8 logic (they differ for a non-ASCII '%xx') */
    char c = src[i];
    unsigned char cUtf = (unsigned char) c;

    /* Replacement for ' ' */
    if (c == '+' && seenQuery) {
      c = cUtf = ' ';
      k = 0;                    /* cancel any sequence */
    }
    /* Escape sequence start */
    else if (c == '%') {
      /* last known position of % written on destination
         copy blindly c, we'll rollback later */
      lastI = i;
      lastJ = j;
    }
    /* End of sequence seen */
    else if (i >= 2 && i == lastI + 2) {
      const int a1 = get_hex_value(src[lastI + 1]);
      const int a2 = get_hex_value(src[lastI + 2]);

      if (a1 != -1 && a2 != -1) {
        const char ec = a1 * 16 + a2;   /* new character */

        cUtf = (unsigned char) ec;
        /* Shortcut for ASCII (do not unescape non-printable) */
        if ((cUtf < 0x80 && cUtf >= 32)
            && (flags & UNESCAPE_URL_NO_ASCII) == 0) {
          /* Rollback new write position and character */
          j = lastJ;
          c = ec;
        }
      } else {
        k = 0;                  /* cancel any sequence */
      }
    }
    /* ASCII (and not in %xx). While no '%' was seen, lastI + 1 wraps to 0,
       so the sentinel is tested explicitly: otherwise the very first
       character would never get here and a leading '?' would be missed. */
    else if (cUtf < 0x80 && (lastI == (size_t) -1 || i != lastI + 1)) {
      k = 0;                    /* cancel any sequence */
      if (c == '?' && !seenQuery) {
        seenQuery = 1;
      }
    }

    /* UTF-8 sequence in progress (either a raw or a %xx character) */
    if (cUtf >= 0x80) {
      /* Leading UTF ? */
      if (HTS_IS_LEADING_UTF8(cUtf)) {
        k = 0;                  /* cancel any sequence */
      }
      /* Copy */
      if (k < sizeof(utfBuffer)) {
        /* First character */
        if (k == 0) {
          /* New destination-centric offset of utf-8 buffer beginning */
          if (lastI != (size_t) -1 && i == lastI + 2) {  /* just read a %xx */
            utfBufferJ = lastJ;     /* position of % */
          } else {
            utfBufferJ = j;         /* current position otherwise */
          }
          /* Sequence length */
          utfBufferSize = hts_getUTF8SequenceLength(cUtf);
        }
        /* Copy */
        utfBuffer[k++] = cUtf;
        /* Flush UTF-8 buffer when completed. */
        if (k == utfBufferSize) {
          const size_t nRead = hts_readUTF8(utfBuffer, utfBufferSize, NULL);

          /* Reset UTF-8 buffer in all cases. */
          k = 0;
          /* Was the character read successfully ? */
          if (nRead == utfBufferSize) {
            /* The sequence and the terminating NUL must both fit; for a raw
               sequence its last byte lands on a position that was not
               checked yet. */
            if (utfBufferJ >= max || utfBufferSize >= max - utfBufferJ) {
              return -1;
            }
            /* Rollback write position to sequence start write position */
            j = utfBufferJ;
            /* Copy full character sequence */
            memcpy(&dest[j], utfBuffer, utfBufferSize);
            j += utfBufferSize;
            /* Skip current character */
            continue;
          }
        }
      }
    }

    /* Check for overflow: the byte goes to dest[j] and the terminating NUL
       will need dest[j + 1] at the latest */
    if (j + 1 >= max) {
      return -1;
    }
    /* Copy current */
    dest[j++] = c;
  }
  /* j <= max - 1 here, so the terminator stays inside the buffer */
  dest[j] = '\0';

  return 0;
}
/* Convenience wrapper: same as hts_unescapeUrlSpecial() with no flag set.
   The result of that call is returned unchanged. */
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
  const int noFlags = 0;

  return hts_unescapeUrlSpecial(src, dest, max, noFlags);
}