Store the DNS cache in a coucal hashtable

The resolver cache was a hand-rolled singly-linked list with a dummy head node: O(n) lookup, O(n^2) build, and each record carried its own next pointer plus an inline copy of the hostname key. Swap it for coucal, the hashtable already used for the backing cache and the ready slots, keyed by hostname with the address record as the value. coucal owns the records (freed through a value handler on coucal_delete) and duplicates the key itself, so t_dnscache sheds both its next link and its inline iadr string and becomes a pure address record.

The state field keeps the same pointer width (t_dnscache* -> coucal), so the installed htsopt.h layout and the ABI are unchanged. Behaviour is identical: same -1/0/>0 lookup contract, same negative caching, same resolve-once semantics, all under the existing opt->state.lock (coucal is not internally serialized against the FTP/web threads). The DNS self-test exercises the full contract black-box and passes unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-29 05:26:32 +03:00 · 2026-06-22 21:12:48 +02:00
16 changed files with 55 additions and 473 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -232,42 +232,30 @@ jobs:
  deb:
    name: deb package (lintian)
    runs-on: ubuntu-24.04
-    # Build and gate inside Debian sid, the upload target. A Debian dpkg-deb
-    # produces archive-legal xz members (an Ubuntu host defaults to zstd, which
-    # the archive's lintian rejects), and sid's lintian carries the same
-    # data-driven checks (embedded-lib fingerprints and the like) the buildds and
-    # UDD apply -- so issues surface here instead of after upload.
-    container: debian:sid
    steps:
-      - name: Install packaging toolchain
-        run: |
-          set -euo pipefail
-          apt-get update
-          apt-get install -y --no-install-recommends \
-            ca-certificates git \
-            build-essential autoconf automake libtool autoconf-archive \
-            zlib1g-dev libssl-dev \
-            debhelper devscripts lintian fakeroot
-
      - uses: actions/checkout@v6
        with:
          submodules: recursive

+      - name: Install packaging toolchain
+        run: |
+          set -euo pipefail
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            build-essential autoconf automake libtool autoconf-archive \
+            zlib1g-dev libssl-dev \
+            debhelper devscripts lintian fakeroot
+
      # --unsigned: CI has no GPG key (also skips the release sig/checksums).
-      # mkdeb builds every package then runs the lintian gate (--fail-on=error,
-      # warning); debuild runs the packaged test pass.
+      # debuild builds every package, then lintian gates on errors.
      #
      # DEB_BUILD_OPTIONS trims work CI does not need (release builds via
      # mkdeb.sh are untouched): noautodbgsym drops the -dbgsym packages whose
      # LTO payloads are slow to compress and that CI never ships; parallel uses
-      # every core.
-      - name: Build and lint Debian packages
+      # every core. We let debuild run its test pass -- the only one now that
+      # mkdeb no longer runs its own -- so CI exercises the packaged tests.
+      - name: Build Debian packages
        run: |
-          set -euo pipefail
-          # The workspace volume is owned by the host runner uid, but the
-          # container runs as root, so mkdeb's git calls (superproject and the
-          # coucal submodule) trip "dubious ownership"; mark them all safe.
-          git config --global --add safe.directory "*"
          export DEB_BUILD_OPTIONS="noautodbgsym parallel=$(nproc)"
          bash tools/mkdeb.sh --unsigned --no-release-artifacts

--- a/debian/libhttrack3.lintian-overrides
+++ b/debian/libhttrack3.lintian-overrides
@@ -1,8 +1,3 @@
 # The shared libraries ship without a versioned symbols control file (ABI is
 # tracked via the SONAME plus a >= upstream-version dependency, see debian/rules).
 libhttrack3: no-symbols-control-file usr/lib/*
-
-# Bundled, locally patched minizip (src/minizip): it adds a zipFlush() API the
-# system libminizip lacks (htscache.c flushes the cache .zip so an interrupted
-# crawl leaves a valid archive), plus Android/old-zlib portability fixes.
-libhttrack3: embedded-library *libminizip*
--- a/debian/proxytrack.lintian-overrides
+++ b/debian/proxytrack.lintian-overrides
@@ -1,3 +0,0 @@
-# Statically linked against httrack's bundled, patched minizip (see src/minizip
-# and libhttrack3's override): the zipFlush() API is absent from the system one.
-proxytrack: embedded-library *libminizip*
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -353,14 +353,6 @@ static void basic_selftests(void) {
    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
                               "noextfile", 1) == 1);
    assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
-    // empty fil: no extension to scan; must not over-read before the string.
-    // flag==0 -> 0 (nothing written), flag==1 -> octet-stream.
-    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
-                               0) == 0);
-    assertf(r.contenttype[0] == '\0');
-    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
-                               1) == 1);
-    assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
    // a user --assume rule with an empty value matches but writes nothing:
    // get_userhttptype returns 1 with the buffer empty, so get_httptype_sized
    // must still report 0 (callers test the return like the old
@@ -2476,44 +2468,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
                htsmain_free();
                return err;
              } break;
-              case 'N': { // url_savename name resolution: httrack -#N <fil>
-                          // <content-type>
-                if (na + 2 < argc) {
-                  lien_adrfilsave afs;
-                  cache_back cache;
-                  struct_back *sback;
-                  hash_struct hash;
-                  lien_back headers;
-
-                  memset(&afs, 0, sizeof(afs));
-                  strcpybuff(afs.af.adr, "www.example.com");
-                  strcpybuff(afs.af.fil, argv[na + 1]);
-
-                  memset(&cache, 0, sizeof(cache));
-                  cache.hashtable = (void *) coucal_new(0);
-
-                  sback = back_new(opt, opt->maxsoc * 32 + 1024);
-                  hash_init(opt, &hash, opt->urlhack);
-
-                  memset(&headers, 0, sizeof(headers));
-                  headers.status = 0;
-                  headers.r.statuscode = HTTP_OK;
-                  strcpybuff(headers.r.contenttype, argv[na + 2]);
-                  strcpybuff(headers.url_fil, argv[na + 1]);
-
-                  url_savename(&afs, NULL, NULL, NULL, opt, sback, &cache,
-                               &hash, 0, 0, &headers);
-                  printf("savename: %s\n", afs.save);
-                  htsmain_free();
-                  return 0;
-                } else {
-                  fprintf(
-                      stderr,
-                      "Option #N requires <fil> <content-type> arguments\n");
-                  htsmain_free();
-                  return 1;
-                }
-              } break;
              case 'C':        // list cache files : httrack -#C '*spid*.gif' will attempt to find the matching file
                {
                  int hasFilter = 0;
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -4177,10 +4177,9 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
    /* Check html -> text/html */
    const char *a = fil + strlen(fil) - 1;

-    /* a < fil when fil is empty: bound before dereferencing */
-    while ((a > fil) && (*a != '.') && (*a != '/'))
+    while((*a != '.') && (*a != '/') && (a > fil))
      a--;
-    if (a >= fil && *a == '.' && strlen(a) < 32) {
+    if (*a == '.' && strlen(a) < 32) {
      int j = 0;

      a++;
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -760,9 +760,9 @@ int url_savename(lien_adrfilsave *const afs,
        strcatbuff(fil, DEFAULT_HTML);  // nommer page par défaut (à priori ici html depuis un proxy http)
    }
  }
-  // Change the extension? e.g. php3 saved as html, cgi as html or gif/xbm
-  // depending on the resolved type.
-  if (ext_chg && !opt->no_type_change) {
+  // Changer extension?
+  // par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
+  if (ext_chg && !opt->no_type_change) {                // changer ext
    char *a = fil + strlen(fil) - 1;

    if ((opt->debug > 1) && (opt->log != NULL)) {
@@ -774,19 +774,11 @@ int url_savename(lien_adrfilsave *const afs,
                      adr_complete, fil_complete, ext);
    }
    if (ext_chg == 1) {
-      // Cut the old extension only when it is empty (a bare trailing dot), the
-      // new one, or a recognized one; an unknown trailing ".token" (e.g.
-      // /article-1.884291, #115) is part of the name, not an extension.
-      const char *const old_ext = get_ext(catbuff, sizeof(catbuff), fil);
-      const int known_ext = !*old_ext || strfield2(old_ext, ext) ||
-                            is_knowntype(opt, fil) || is_dyntype(old_ext) ||
-                            ishtml_ext(old_ext) != -1;
-
      while((a > fil) && (*a != '.') && (*a != '/'))
        a--;
-      if (*a == '.' && known_ext)
-        *a = '\0';          // cut
-      strcatbuff(fil, "."); // re-add the dot
+      if (*a == '.')
+        *a = '\0';              // couper
+      strcatbuff(fil, ".");     // recopier point
    } else {
      while((a > fil) && (*a != '/'))
        a--;
@@ -794,7 +786,7 @@ int url_savename(lien_adrfilsave *const afs,
        a++;
      *a = '\0';
    }
-    strcatbuff(fil, ext); // append ext/name
+    strcatbuff(fil, ext);       // copier ext/nom
  }
  // Rechercher premier / et dernier .
  {
@@ -1729,10 +1721,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
    StringBuff(opt->path_log), digest_filename);
 }

-/* remove refname if any; HTS_TRUE if it was removed */
-hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
-                                        const char *fil) {
+/* remove refname if any */
+void url_savename_refname_remove(httrackp * opt, const char *adr,
+                                 const char *fil) {
  char *filename = url_savename_refname_fullpath(opt, adr, fil);

-  return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
+  (void) UNLINK(filename);
 }
--- a/src/htsname.h
+++ b/src/htsname.h
@@ -104,9 +104,8 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
 void url_savename_refname(const char *adr, const char *fil, char *filename);
 char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
                                    const char *fil);
-/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
-hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
-                                        const char *fil);
+void url_savename_refname_remove(httrackp * opt, const char *adr,
+                                 const char *fil);
 #endif

 #endif
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -3749,60 +3749,44 @@ int hts_mirror_check_moved(htsmoduleStruct * str,

      }                         // bloc
      // erreur HTTP (ex: 404, not found)
-    } else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
-               (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
-      // 412/416: the resume partial is stale; re-get the whole file (#206)
-      lien_back *itemback = NULL;
-      int had_partial = 0;
-      int ref_existed = 0;
-      int ref_gone;
-
-      // Drop the temp-ref, its partial, and heap->sav so the re-get carries no
-      // Range; else back_add rebuilds the same Range and loops.
-      if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
-                               &itemback) == 0) {
-        had_partial = 1;
-        ref_existed = 1;
-        // best-effort: an orphaned partial cannot re-Range once the ref is gone
-        if (fexist_utf8(itemback->url_sav))
-          (void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
-                              itemback->url_sav));
-        back_clear_entry(itemback);
-        freet(itemback);
-      }
-      // don't re-record if the ref survived (it would re-Range and loop)
-      ref_gone =
-          url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
-          !ref_existed;
+    } else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
+               || (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
+      ) {                       // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
      if (fexist_utf8(heap(ptr)->sav)) {
-        had_partial = 1;
-        remove(heap(ptr)->sav);
+        remove(heap(ptr)->sav);        // Eliminer
+      } else {
+        hts_log_print(opt, LOG_WARNING,
+                      "Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
+                      r->msg, urladr(), urlfil(),
+                      heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
      }
-
-      // Re-get once, only if a partial existed and both Range triggers are
-      // gone; a failed removal gives up rather than looping. range_used is
-      // unreliable (it does not survive the delayed-type two-pass).
-      if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
+      if (!fexist_utf8(heap(ptr)->sav)) {    // Bien éliminé? (sinon on boucle..)
+#if HDEBUG
+        printf("Partial content NOT up-to-date, reget all file for %s\n",
+               heap(ptr)->sav);
+#endif
        hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
                      r->msg, urladr(), urlfil());
+        // enregistrer le MEME lien
        if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
-          heap_top()->testmode = heap(ptr)->testmode;
-          heap_top()->link_import = 0;
+          heap_top()->testmode = heap(ptr)->testmode;   // mode test?
+          heap_top()->link_import = 0;   // pas mode import
          heap_top()->depth = heap(ptr)->depth;
          heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
          heap_top()->retry = heap(ptr)->retry;
          heap_top()->premier = heap(ptr)->premier;
          heap_top()->precedent = ptr;
+          //
+          // canceller lien actuel
          error = 1;
-          hts_invalidate_link(opt, ptr); // invalidate hashtable entry
-        } else {                         // out of memory
-          XH_uninit;
+          hts_invalidate_link(opt, ptr);  // invalidate hashtable entry
+          //
+        } else {              // oups erreur, plus de mémoire!!
+          XH_uninit;          // désallocation mémoire & buffers
          return 0;
        }
      } else {
-        hts_log_print(opt, LOG_WARNING,
-                      "Giving up on partial reget (%s) for %s%s", r->msg,
-                      urladr(), urlfil());
+        hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
        error = 1;
      }

--- a/tests/01_engine-savename.test
+++ b/tests/01_engine-savename.test
@@ -1,41 +0,0 @@
-#!/bin/bash
-#
-
-set -euo pipefail
-
-# Local save-name extension resolution (url_savename via -#N <fil> <content-type>).
-# Asserts on the basename of "savename: <path>".
-
-name() {
-    out="$(httrack -O /dev/null -#N "$1" "$2" | sed -n 's/^savename: //p')"
-    test "${out##*/}" == "$3" || {
-        echo "FAIL: '$1' '$2' -> '$out' (want '$3')"
-        exit 1
-    }
-}
-
-# #115: an unknown trailing ".token" is part of the name, keep it and append the type.
-name '/article-1.884291' 'text/html' 'article-1.884291.html'
-name '/news/story-12345.987654' 'text/html' 'story-12345.987654.html'
-
-# Recognized extensions still collapse to the resolved type.
-name '/page.php' 'text/html' 'page.html'
-name '/page.asp' 'text/html' 'page.html'
-name '/foo' 'text/html' 'foo.html'
-
-# A bare trailing dot is not a tail to keep.
-name '/page.' 'text/html' 'page.html'
-
-# Soft-404 (#267/#408): a binary URL served as HTML is named .html.
-name '/x.pdf' 'text/html' 'x.html'
-name '/x.gif' 'text/html' 'x.html'
-
-# Type agrees with the extension: keep it, no churn, no double extension.
-name '/x.pdf' 'application/pdf' 'x.pdf'
-name '/x.jpg' 'image/jpeg' 'x.jpg'
-name '/x.html' 'text/html' 'x.html'
-name '/x.js' 'application/x-javascript' 'x.js'
-name '/types/data.json' 'application/json' 'data.json'
-
-# Agreeing type must not rewrite the extension's casing (no strip-and-reappend).
-name '/x.JPG' 'image/jpeg' 'x.JPG'
--- a/tests/20_local-resume-loop.test
+++ b/tests/20_local-resume-loop.test
@@ -1,113 +0,0 @@
-#!/bin/bash
-# Issue #206: a continue/update crawl looped forever when the resume Range got a
-# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
-set -u
-
-: "${top_srcdir:=..}"
-testdir=$(cd "$(dirname "$0")" && pwd)
-server="${testdir}/local-server.py"
-
-command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
-
-tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
-serverpid=
-crawlpid=
-cleanup() {
-    test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
-    if test -n "$serverpid"; then
-        kill "$serverpid" 2>/dev/null
-        wait "$serverpid" 2>/dev/null
-    fi
-    rm -rf "$tmpdir"
-}
-trap cleanup EXIT HUP INT QUIT PIPE TERM
-
-# --- start the server, discover its ephemeral port --------------------------
-# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
-serverlog="${tmpdir}/server.log"
-counter="${tmpdir}/blobcount"
-RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
-serverpid=$!
-port=
-for _ in $(seq 1 50); do
-    line=$(head -n1 "$serverlog" 2>/dev/null)
-    if test "${line%% *}" == "PORT"; then
-        port="${line#PORT }"
-        break
-    fi
-    kill -0 "$serverpid" 2>/dev/null || {
-        echo "server exited early: $(cat "$serverlog")"
-        exit 1
-    }
-    sleep 0.1
-done
-test -n "$port" || {
-    echo "could not discover server port"
-    exit 1
-}
-base="http://127.0.0.1:${port}"
-
-which httrack >/dev/null || {
-    echo "could not find httrack"
-    exit 1
-}
-out="${tmpdir}/crawl"
-mkdir "$out"
-common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
-refdir="${out}/hts-cache/ref"
-
-# --- pass 1: crawl, interrupt once the blob download is underway -------------
-printf '[pass 1: interrupt mid-download] ..\t'
-httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
-crawlpid=$!
-# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
-# finalizes the cache and serializes the temp-ref.
-for _ in $(seq 1 300); do
-    test -s "$counter" && break
-    kill -0 "$crawlpid" 2>/dev/null || break
-    sleep 0.1
-done
-sleep 0.5
-kill -TERM "$crawlpid" 2>/dev/null
-wait "$crawlpid" 2>/dev/null
-crawlpid=
-test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
-    echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
-    exit 1
-}
-echo "OK (temp-ref present)"
-before=$(wc -c <"$counter" 2>/dev/null || echo 0)
-
-# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
-# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
-printf '[pass 2: resume must terminate] ..\t'
-HANG_RC=137 # 128 + SIGKILL
-httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
-crawlpid=$!
-(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
-guard=$!
-rc=0
-wait "$crawlpid" 2>/dev/null || rc=$?
-crawlpid=
-kill "$guard" 2>/dev/null || true
-wait "$guard" 2>/dev/null || true
-if test "$rc" -eq "$HANG_RC"; then
-    echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
-    exit 1
-fi
-echo "OK (terminated, rc=$rc)"
-
-# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
-# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
-after=$(wc -c <"$counter" 2>/dev/null || echo 0)
-hits=$((after - before))
-printf '[bounded re-get count] ..\t'
-if test "$hits" -lt 2; then
-    echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
-    exit 1
-fi
-if test "$hits" -gt 8; then
-    echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
-    exit 1
-fi
-echo "OK (${hits} requests)"
--- a/tests/21_local-intl-update.test
+++ b/tests/21_local-intl-update.test
@@ -1,11 +0,0 @@
-#!/bin/bash
-#
-# #157: a dotless, accented URL named .html on the first crawl must keep .html
-# across an update -- not revert to the extensionless name.
-
-: "${top_srcdir:=..}"
-
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
-    --found 'intl/Instalação_CVS_no_Ubuntu.html' \
-    --not-found 'intl/Instalação_CVS_no_Ubuntu' \
-    httrack 'BASEURL/intl/index.html'
--- a/tests/22_local-broken-size.test
+++ b/tests/22_local-broken-size.test
@@ -1,17 +0,0 @@
-#!/bin/bash
-# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
-# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
-
-: "${top_srcdir:=..}"
-
-# Default: warn, but the file is still written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
-    --found 'size/oversize.bin' \
-    --log-found 'bogus state \(broken size' \
-    httrack 'BASEURL/size/index.html'
-
-# -%B (tolerant): no warning, file written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
-    --found 'size/oversize.bin' \
-    --log-not-found 'bogus state' \
-    httrack 'BASEURL/size/index.html' '-%B'
--- a/tests/23_local-errpage.test
+++ b/tests/23_local-errpage.test
@@ -1,19 +0,0 @@
-#!/bin/bash
-# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
-# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
-# half also does not reproduce; the purge path is not exercised here.)
-set -e
-
-: "${top_srcdir:=..}"
-
-# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
-    --found 'errpage/good.html' \
-    --found 'errpage/empty.html' \
-    --not-found 'errpage/missing.html' \
-    httrack 'BASEURL/errpage/index.html' '-o0'
-
-# Control -o1 (default): the 404 error page is written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
-    --found 'errpage/missing.html' \
-    httrack 'BASEURL/errpage/index.html' '-o1'
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -40,7 +40,6 @@ TESTS = \
 	01_engine-parse.test \
 	01_engine-rcfile.test \
 	01_engine-relative.test \
-	01_engine-savename.test \
 	01_engine-simplify.test \
 	01_engine-strsafe.test \
 	02_manpage-regen.test \
@@ -59,10 +58,6 @@ TESTS = \
 	16_local-assume.test \
 	17_local-empty-ct.test \
 	18_local-update.test \
-	19_local-connect-fallback.test \
-	20_local-resume-loop.test \
-	21_local-intl-update.test \
-	22_local-broken-size.test \
-	23_local-errpage.test
+	19_local-connect-fallback.test

 CLEANFILES = check-network_sh.cache
--- a/tests/local-crawl.sh
+++ b/tests/local-crawl.sh
@@ -14,9 +14,7 @@
 # Usage:
 #   bash local-crawl.sh [--tls] [--root DIR] \
 #       --errors N --files N --found PATH ... --directory PATH ... \
-#       --log-found REGEX ... --log-not-found REGEX ... \
 #       httrack BASEURL/some/path [httrack-args...]
-# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.

 set -u

@@ -109,7 +107,7 @@ while test "$pos" -lt "$nargs"; do
        audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
        pos=$((pos + 1))
        ;;
-    --found | --not-found | --directory | --log-found | --log-not-found)
+    --found | --not-found | --directory)
        audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
        pos=$((pos + 1))
        ;;
@@ -198,15 +196,6 @@ if test -n "$rerun"; then
        exit 1
    }
    result "OK (update)"
-    # The update summary reports "files updated"; a fresh crawl never does. Assert
-    # it so a regression that bypasses the cache (re-crawls fresh) can't pass.
-    info "checking update used the cache"
-    if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
-        result "OK"
-    else
-        result "update pass did not report cache activity"
-        exit 1
-    fi
 fi

 # --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
@@ -259,22 +248,6 @@ while test "$i" -lt "${#audit[@]}"; do
            exit 1
        fi
        ;;
-    --log-found)
-        i=$((i + 1))
-        info "checking log matches ${audit[$i]}"
-        if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
-            result "not in log"
-            exit 1
-        fi
-        ;;
-    --log-not-found)
-        i=$((i + 1))
-        info "checking log lacks ${audit[$i]}"
-        if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
-            result "present in log"
-            exit 1
-        else result "OK"; fi
-        ;;
    esac
    i=$((i + 1))
 done
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -15,7 +15,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.

 import argparse
 import os
-import time
 from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
 from urllib.parse import quote, unquote, urlsplit

@@ -177,87 +176,6 @@ class Handler(SimpleHTTPRequestHandler):
        body, ctype = self.TYPE_MATRIX[path]
        self.send_raw(body, ctype)

-    # --- special chars in URLs across an update (issue #157) ---------------
-    # A dotless, accented basename served as text/html (MediaWiki style). The
-    # name the first crawl picks (.html) must survive the update pass.
-    INTL_NAME = "Instalação_CVS_no_Ubuntu"
-
-    def route_intl_index(self):
-        self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
-
-    def route_intl_page(self):
-        self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
-
-    # resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
-    # can be interrupted (partial + temp-ref); every later request is 416.
-    RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096  # flushed before the stall
-    RESUME_LEN = len(RESUME_PREFIX) + 4096  # declared length never delivered
-    _resume_started = False
-
-    def route_resume_index(self):
-        self.send_html('\t<a href="blob.txt">blob</a>')
-
-    def route_resume(self):
-        counter = os.environ.get("RESUME_COUNTER")
-        if counter:
-            with open(counter, "a") as fp:
-                fp.write("x")
-        # First GET: stall mid-body so the crawl can be interrupted with a partial.
-        if not Handler._resume_started:
-            Handler._resume_started = True
-            self.send_response(200)
-            self.send_header("Content-Type", "image/png")
-            self.send_header("Content-Length", str(self.RESUME_LEN))
-            self.send_header("Accept-Ranges", "bytes")
-            self.end_headers()
-            if self.command != "HEAD":
-                self.wfile.write(self.RESUME_PREFIX)
-                self.wfile.flush()
-                try:
-                    while True:
-                        time.sleep(3600)
-                except OSError:
-                    pass
-            return
-        self.send_response(416, "Requested Range Not Satisfiable")
-        self.send_header("Content-Type", "image/png")
-        self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
-        self.send_header("Content-Length", "0")
-        self.end_headers()
-
-    # error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
-    # bodies off disk; a genuine 0-byte 200 is a valid file and stays.
-    def route_errpage_index(self):
-        self.send_html(
-            '\t<a href="good.html">good</a>\n'
-            '\t<a href="missing.html">missing</a>\n'
-            '\t<a href="empty.html">empty</a>\n'
-        )
-
-    def route_errpage_good(self):
-        self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
-
-    def route_errpage_missing(self):
-        self.send_html("\t404 error body", status=404, extra_status="Not Found")
-
-    def route_errpage_empty(self):
-        self.send_raw(b"", "text/html")
-
-    # broken Content-Length (#32/#41): declared size != bytes sent. httrack
-    # warns "bogus state (broken size)" and skips the cache unless -%B.
-    def route_size_index(self):
-        self.send_html('\t<a href="oversize.bin">over</a>\n')
-
-    def route_size_oversize(self):
-        body = b"A" * 100
-        self.send_response(200)
-        self.send_header("Content-Type", "application/octet-stream")
-        self.send_header("Content-Length", str(len(body) - 2))  # lie: too short
-        self.send_header("Connection", "close")
-        self.end_headers()
-        if self.command != "HEAD":
-            self.wfile.write(body)
-
    ROUTES = {
        "/cookies/entrance.php": route_entrance,
        "/cookies/second.php": route_second,
@@ -277,16 +195,6 @@ class Handler(SimpleHTTPRequestHandler):
        "/types/style.css": route_types,
        "/types/data.json": route_types,
        "/types/gen.php": route_types,
-        "/intl/index.html": route_intl_index,
-        "/intl/" + INTL_NAME: route_intl_page,
-        "/resume/index.html": route_resume_index,
-        "/resume/blob.txt": route_resume,
-        "/size/index.html": route_size_index,
-        "/size/oversize.bin": route_size_oversize,
-        "/errpage/index.html": route_errpage_index,
-        "/errpage/good.html": route_errpage_good,
-        "/errpage/missing.html": route_errpage_missing,
-        "/errpage/empty.html": route_errpage_empty,
    }

    # --- dispatch ----------------------------------------------------------
@@ -294,8 +202,7 @@ class Handler(SimpleHTTPRequestHandler):
    def dispatch(self):
        self._set_cookies = []
        path = urlsplit(self.path).path
-        # Match percent-encoded paths (accented #157 route) by their decoded form.
-        handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
+        handler = self.ROUTES.get(path)
        if handler is not None:
            handler(self)
            return True