Keep unrecognized URL tails instead of mangling them to .html

url_savename truncated any trailing ".token" when applying a resolved content-type, so /article-1.884291 served as text/html was saved as article-1.html, dropping the .884291 tail and colliding with every sibling sharing the prefix.

Cut the old extension only when it is empty (a bare trailing dot), the resolved type, a known MIME extension, a dynamic-page extension, or an html-family extension; otherwise keep the tail and append the type (article-1.884291.html). Recognized extensions still collapse as before, so the #267/#408 soft-404 behavior (a binary URL served as HTML named .html) is preserved, and a type that agrees with the extension causes no churn.

Add a hidden -#N <fil> <content-type> self-test driving url_savename offline, plus tests/01_engine-savename.test covering the matrix.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Xavier Roche <roche@httrack.com>
2026-06-26 12:07:54 +03:00 · 2026-06-24 18:25:07 +02:00
17 changed files with 50 additions and 556 deletions
--- a/src/htscache.c
+++ b/src/htscache.c
@@ -220,25 +220,6 @@ struct cache_back_zip_entry {
 	} \
 } while(0)

-/* A cache (new.zip) write failed: storage is gone (disk full / dropped share),
-   so the mirror is doomed too. Abort it via exit_xh, don't crash as assertf
-   did. */
-static void cache_zip_write_failed(httrackp *opt, cache_back *cache,
-                                   const char *what, int zErr) {
-  if (!cache->zipWriteFailed) {
-    cache->zipWriteFailed = HTS_TRUE;
-    if (check_fatal_io_errno()) {
-      hts_log_print(opt, LOG_ERROR,
-                    "Mirror aborted: disk full or filesystem problems");
-    } else {
-      hts_log_print(opt, LOG_ERROR,
-                    "Mirror aborted: cache write failed (%s): %s", what,
-                    hts_get_zerror(zErr));
-    }
-  }
-  opt->state.exit_xh = -1; /* fatal: stop the mirror, exit non-zero */
-}
-
 /* Ajout d'un fichier en cache */
 void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
               const char *url_adr, const char *url_fil, const char *url_save,
@@ -255,10 +236,6 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
  const char *url_save_suffix = url_save;
  int zErr;

-  /* already failed and aborting; don't touch the broken stream again */
-  if (cache->zipWriteFailed)
-    return;
-
  // robots.txt hack
  if (url_save == NULL) {
    dataincache = 0;            // testing links
@@ -369,8 +346,9 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
                                   */
                                  headers, (uInt) strlen(headers), NULL, 0, NULL,       /* comment */
                                  Z_DEFLATED, Z_DEFAULT_COMPRESSION)) != Z_OK) {
-    cache_zip_write_failed(opt, cache, "opening a cache entry", zErr);
-    return;
+    int zip_zipOpenNewFileInZip_failed = 0;
+
+    assertf(zip_zipOpenNewFileInZip_failed);
  }

  /* Write data in cache */
@@ -380,8 +358,9 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
        if ((zErr =
             zipWriteInFileInZip((zipFile) cache->zipOutput, r->adr,
                                 (int) r->size)) != Z_OK) {
-          cache_zip_write_failed(opt, cache, "writing to the cache", zErr);
-          return;
+          int zip_zipWriteInFileInZip_failed = 0;
+
+          assertf(zip_zipWriteInFileInZip_failed);
        }
      }
    } else {
@@ -402,10 +381,9 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,
              if ((zErr =
                   zipWriteInFileInZip((zipFile) cache->zipOutput, buff,
                                       (int) nl)) != Z_OK) {
-                cache_zip_write_failed(opt, cache, "writing to the cache",
-                                       zErr);
-                fclose(fp);
-                return;
+                int zip_zipWriteInFileInZip_failed = 0;
+
+                assertf(zip_zipWriteInFileInZip_failed);
              }
            }
          } while(nl > 0);
@@ -419,14 +397,16 @@ void cache_add(httrackp * opt, cache_back * cache, const htsblk * r,

  /* Close */
  if ((zErr = zipCloseFileInZip((zipFile) cache->zipOutput)) != Z_OK) {
-    cache_zip_write_failed(opt, cache, "closing a cache entry", zErr);
-    return;
+    int zip_zipCloseFileInZip_failed = 0;
+
+    assertf(zip_zipCloseFileInZip_failed);
  }

  /* Flush */
  if ((zErr = zipFlush((zipFile) cache->zipOutput)) != 0) {
-    cache_zip_write_failed(opt, cache, "flushing the cache", zErr);
-    return;
+    int zip_zipFlush_failed = 0;
+
+    assertf(zip_zipFlush_failed);
  }
 }

--- a/src/htscache_selftest.c
+++ b/src/htscache_selftest.c
@@ -47,7 +47,6 @@ Please visit our Website: http://www.httrack.com
 #include "htslib.h"
 #include "htszlib.h"

-#include <errno.h>
 #include <stdio.h>
 #include <string.h>

@@ -317,136 +316,6 @@ static int disk_fallback_selftest(httrackp *opt) {
  return fail;
 }

-typedef struct {
-  size_t budget;  /**< bytes allowed through before writes start failing */
-  int fail_errno; /**< errno set on the failing write (ENOSPC, EIO, ...) */
-  int writes;     /**< zwrite call count, to detect re-entry into the stream */
-} writefail_inject;
-
-/* zwrite that copies until the budget runs out, then fails with inj->fail_errno
-   (the #174/#219 condition). Counts calls so the test can prove a flagged cache
-   never re-enters the stream. */
-static uLong selftest_failing_zwrite(voidpf opaque, voidpf stream,
-                                     const void *buf, uLong size) {
-  writefail_inject *inj = (writefail_inject *) opaque;
-
-  inj->writes++;
-  if (inj->budget >= (size_t) size) {
-    inj->budget -= (size_t) size;
-    return (uLong) fwrite(buf, 1, (size_t) size, (FILE *) stream);
-  }
-  errno = inj->fail_errno;
-  return 0; /* short write -> the minizip op returns an error */
-}
-
-/* Open a ZIP whose writes fail past inj->budget, so cache_add() hits an error.
- */
-static zipFile selftest_open_failing_zip(const char *path,
-                                         writefail_inject *inj) {
-  zlib_filefunc_def ff;
-
-  fill_fopen_filefunc(&ff); /* real fopen/read/seek/close; ignores opaque */
-  ff.zwrite_file = selftest_failing_zwrite;
-  ff.opaque = inj;
-  return zipOpen2(path, APPEND_STATUS_CREATE, NULL, &ff);
-}
-
-/* Store one octet-stream body into `cache` (all-in-cache, body in the ZIP). */
-static void writefail_store(httrackp *opt, cache_back *cache, const char *fil,
-                            const char *body, size_t body_len) {
-  htsblk r;
-  char locbuf[4];
-  char *bodycopy = malloct(body_len);
-
-  hts_init_htsblk(&r);
-  r.statuscode = 200;
-  r.size = (LLint) body_len;
-  strcpybuff(r.msg, "OK");
-  strcpybuff(r.contenttype, "application/octet-stream");
-  locbuf[0] = '\0';
-  r.location = locbuf;
-  r.is_write = 0;
-  memcpy(bodycopy, body, body_len);
-  r.adr = bodycopy;
-  cache_add(opt, cache, &r, "example.com", fil, "example.com/blob.bin", 1,
-            NULL);
-  freet(bodycopy);
-}
-
-/* #174/#219: a failing cache write used to crash via assertf(); it must instead
-   stop the mirror (exit_xh = -1) without crashing. Assert that, plus the cache
-   is flagged and a sibling write doesn't re-enter the broken stream. */
-int cache_write_failure_selftest(httrackp *opt, const char *dir) {
-  int fail = 0;
-  char path[HTS_URLMAXSIZE];
-  /* incompressible + big, so deflate flushes (and fails) mid-write, before
-   * close */
-  static const size_t body_len = 256 * 1024;
-  char *body = malloct(body_len);
-  int phase;
-
-  gen_body(body, body_len, 1 /* incompressible */);
-  fconcat(path, sizeof(path), dir, "/wfail.zip");
-
-  /* phase 0: fail on the body write, fatal errno (ENOSPC, the disk-full
-     branch). phase 1: fail on the open, non-fatal errno (EIO, dropped-share
-     branch). Both must abort the mirror. */
-  for (phase = 0; phase < 2; phase++) {
-    cache_back cache;
-    writefail_inject inj;
-    int writes_after_fail;
-
-    inj.budget = (phase == 0) ? 4096 : 0;
-    inj.fail_errno = (phase == 0) ? ENOSPC : EIO;
-    inj.writes = 0;
-    memset(&cache, 0, sizeof(cache));
-    cache.type = 1;
-    cache.log = stderr;
-    cache.errlog = stderr;
-    cache.hashtable = coucal_new(0);
-    cache.zipOutput = selftest_open_failing_zip(path, &inj);
-    if (cache.zipOutput == NULL) {
-      fprintf(stderr, "cache-writefail: could not open injected ZIP\n");
-      fail++;
-      continue;
-    }
-
-    opt->state.exit_xh = 0; /* clear; the failing write must set it to -1 */
-    writefail_store(opt, &cache, "/blob.bin", body, body_len);
-    if (!cache.zipWriteFailed) {
-      fprintf(stderr, "cache-writefail: phase %d: write error not caught\n",
-              phase);
-      fail++;
-    }
-    if (opt->state.exit_xh != -1) {
-      fprintf(stderr,
-              "cache-writefail: phase %d: mirror not aborted (exit_xh=%d)\n",
-              phase, opt->state.exit_xh);
-      fail++;
-    }
-
-    /* a flagged cache must no-op a sibling write: no further backend write */
-    writes_after_fail = inj.writes;
-    writefail_store(opt, &cache, "/blob2.bin", body, 16);
-    if (inj.writes != writes_after_fail) {
-      fprintf(stderr,
-              "cache-writefail: phase %d: sibling write re-entered the broken "
-              "stream (%d extra backend writes)\n",
-              phase, inj.writes - writes_after_fail);
-      fail++;
-    }
-
-    if (cache.zipOutput != NULL) {
-      zipClose(cache.zipOutput,
-               NULL); /* best-effort; may fail on the backend */
-      cache.zipOutput = NULL;
-    }
-  }
-
-  freet(body);
-  return fail;
-}
-
 int cache_selftests(httrackp *opt, const char *dir) {
  int failures = 0;
  cache_back cache;
--- a/src/htscache_selftest.h
+++ b/src/htscache_selftest.h
@@ -52,10 +52,6 @@ int cache_selftests(httrackp *opt, const char *dir);
   committed file, never by the test). Returns the failed-check count. */
 int cache_golden_selftest(httrackp *opt, const char *dir, int regen);

-/* #174/#219: assert a failing cache write aborts the mirror cleanly instead of
-   crashing. Returns the failed-check count. */
-int cache_write_failure_selftest(httrackp *opt, const char *dir);
-
 #endif

 #endif
--- a/src/htscore.h
+++ b/src/htscore.h
@@ -214,8 +214,6 @@ struct cache_back {
  cache_back_zip_entry *zipEntries;
  int zipEntriesOffs;
  int zipEntriesCapa;
-  hts_boolean
-      zipWriteFailed; /**< a cache write failed; stop touching the stream */
 };

 #ifndef HTS_DEF_FWSTRUCT_hash_struct
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -353,14 +353,6 @@ static void basic_selftests(void) {
    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype),
                               "noextfile", 1) == 1);
    assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
-    // empty fil: no extension to scan; must not over-read before the string.
-    // flag==0 -> 0 (nothing written), flag==1 -> octet-stream.
-    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
-                               0) == 0);
-    assertf(r.contenttype[0] == '\0');
-    assertf(get_httptype_sized(opt, r.contenttype, sizeof(r.contenttype), "",
-                               1) == 1);
-    assertf(strcmp(r.contenttype, "application/octet-stream") == 0);
    // a user --assume rule with an empty value matches but writes nothing:
    // get_userhttptype returns 1 with the buffer empty, so get_httptype_sized
    // must still report 0 (callers test the return like the old
@@ -2453,20 +2445,6 @@ static int hts_main_internal(int argc, char **argv, httrackp * opt) {
                  return 1;
                }
                break;
-              case 'W': // cache write-failure handling: httrack -#W <dir>
-                if (na + 1 < argc) {
-                  const int err =
-                      cache_write_failure_selftest(opt, argv[na + 1]);
-
-                  printf("cache-writefail: %s\n", err ? "FAIL" : "OK");
-                  htsmain_free();
-                  return err;
-                } else {
-                  fprintf(stderr, "Option #W requires a directory argument\n");
-                  htsmain_free();
-                  return 1;
-                }
-                break;
              case 'B': // golden cache fixture read: httrack -#B <dir> [regen]
                if (na + 1 < argc) {
                  const int regen =
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -4177,10 +4177,9 @@ HTSEXT_API hts_boolean get_httptype_sized(httrackp *opt, char *s, size_t ssize,
    /* Check html -> text/html */
    const char *a = fil + strlen(fil) - 1;

-    /* a < fil when fil is empty: bound before dereferencing */
-    while ((a > fil) && (*a != '.') && (*a != '/'))
+    while((*a != '.') && (*a != '/') && (a > fil))
      a--;
-    if (a >= fil && *a == '.' && strlen(a) < 32) {
+    if (*a == '.' && strlen(a) < 32) {
      int j = 0;

      a++;
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -1729,10 +1729,10 @@ char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
    StringBuff(opt->path_log), digest_filename);
 }

-/* remove refname if any; HTS_TRUE if it was removed */
-hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
-                                        const char *fil) {
+/* remove refname if any */
+void url_savename_refname_remove(httrackp * opt, const char *adr,
+                                 const char *fil) {
  char *filename = url_savename_refname_fullpath(opt, adr, fil);

-  return UNLINK(filename) == 0 ? HTS_TRUE : HTS_FALSE;
+  (void) UNLINK(filename);
 }
--- a/src/htsname.h
+++ b/src/htsname.h
@@ -104,9 +104,8 @@ char *url_md5(char *digest_buffer, const char *fil_complete);
 void url_savename_refname(const char *adr, const char *fil, char *filename);
 char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
                                    const char *fil);
-/* Remove the temp-ref for (adr,fil); HTS_TRUE if it was removed. */
-hts_boolean url_savename_refname_remove(httrackp *opt, const char *adr,
-                                        const char *fil);
+void url_savename_refname_remove(httrackp * opt, const char *adr,
+                                 const char *fil);
 #endif

 #endif
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -3749,60 +3749,44 @@ int hts_mirror_check_moved(htsmoduleStruct * str,

      }                         // bloc
      // erreur HTTP (ex: 404, not found)
-    } else if ((r->statuscode == HTTP_PRECONDITION_FAILED) ||
-               (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)) {
-      // 412/416: the resume partial is stale; re-get the whole file (#206)
-      lien_back *itemback = NULL;
-      int had_partial = 0;
-      int ref_existed = 0;
-      int ref_gone;
-
-      // Drop the temp-ref, its partial, and heap->sav so the re-get carries no
-      // Range; else back_add rebuilds the same Range and loops.
-      if (back_unserialize_ref(opt, heap(ptr)->adr, heap(ptr)->fil,
-                               &itemback) == 0) {
-        had_partial = 1;
-        ref_existed = 1;
-        // best-effort: an orphaned partial cannot re-Range once the ref is gone
-        if (fexist_utf8(itemback->url_sav))
-          (void) UNLINK(fconv(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
-                              itemback->url_sav));
-        back_clear_entry(itemback);
-        freet(itemback);
-      }
-      // don't re-record if the ref survived (it would re-Range and loop)
-      ref_gone =
-          url_savename_refname_remove(opt, heap(ptr)->adr, heap(ptr)->fil) ||
-          !ref_existed;
+    } else if ((r->statuscode == HTTP_PRECONDITION_FAILED)
+               || (r->statuscode == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE)
+      ) {                       // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier
      if (fexist_utf8(heap(ptr)->sav)) {
-        had_partial = 1;
-        remove(heap(ptr)->sav);
+        remove(heap(ptr)->sav);        // Eliminer
+      } else {
+        hts_log_print(opt, LOG_WARNING,
+                      "Unexpected 412/416 error (%s) for %s%s, '%s' could not be found on disk",
+                      r->msg, urladr(), urlfil(),
+                      heap(ptr)->sav != NULL ? heap(ptr)->sav : "");
      }
-
-      // Re-get once, only if a partial existed and both Range triggers are
-      // gone; a failed removal gives up rather than looping. range_used is
-      // unreliable (it does not survive the delayed-type two-pass).
-      if (had_partial && ref_gone && !fexist_utf8(heap(ptr)->sav)) {
+      if (!fexist_utf8(heap(ptr)->sav)) {    // Bien éliminé? (sinon on boucle..)
+#if HDEBUG
+        printf("Partial content NOT up-to-date, reget all file for %s\n",
+               heap(ptr)->sav);
+#endif
        hts_log_print(opt, LOG_DEBUG, "Partial file reget (%s) for %s%s",
                      r->msg, urladr(), urlfil());
+        // enregistrer le MEME lien
        if (hts_record_link(opt, heap(ptr)->adr, heap(ptr)->fil, heap(ptr)->sav, "", "", NULL)) {
-          heap_top()->testmode = heap(ptr)->testmode;
-          heap_top()->link_import = 0;
+          heap_top()->testmode = heap(ptr)->testmode;   // mode test?
+          heap_top()->link_import = 0;   // pas mode import
          heap_top()->depth = heap(ptr)->depth;
          heap_top()->pass2 = max(heap(ptr)->pass2, numero_passe);
          heap_top()->retry = heap(ptr)->retry;
          heap_top()->premier = heap(ptr)->premier;
          heap_top()->precedent = ptr;
+          //
+          // canceller lien actuel
          error = 1;
-          hts_invalidate_link(opt, ptr); // invalidate hashtable entry
-        } else {                         // out of memory
-          XH_uninit;
+          hts_invalidate_link(opt, ptr);  // invalidate hashtable entry
+          //
+        } else {              // oups erreur, plus de mémoire!!
+          XH_uninit;          // désallocation mémoire & buffers
          return 0;
        }
      } else {
-        hts_log_print(opt, LOG_WARNING,
-                      "Giving up on partial reget (%s) for %s%s", r->msg,
-                      urladr(), urlfil());
+        hts_log_print(opt, LOG_ERROR, "Can not remove old file %s", urlfil());
        error = 1;
      }

--- a/tests/01_engine-cache-writefail.test
+++ b/tests/01_engine-cache-writefail.test
@@ -1,24 +0,0 @@
-#!/bin/bash
-#
-# Keep this POSIX-portable: the harness runs it via $(BASH), which is a plain
-# POSIX /bin/sh on some platforms (e.g. macOS), so avoid bashisms and GNU-only
-# tool flags despite the #!/bin/bash above.
-
-# Cache write-failure handling (httrack -#W <dir>). #174/#219.
-# A failing new.zip write (disk full) used to crash the process via assertf; it
-# must instead stop the mirror with a fatal error (exit_xh=-1), no crash. The
-# self-test asserts that; reverting the fix makes -#W abort (SIGABRT) and fail.
-
-set -eu
-
-dir=$(mktemp -d)
-trap 'rm -rf "$dir"' EXIT
-
-out=$(httrack -#W "$dir")
-
-# Match the exact success line (error logs also go to stdout); a bad -#W falls
-# through to the usage screen, which exits non-zero but never prints this.
-printf '%s\n' "$out" | grep -qx "cache-writefail: OK" || {
-    echo "expected 'cache-writefail: OK', got: $out" >&2
-    exit 1
-}
--- a/tests/20_local-resume-loop.test
+++ b/tests/20_local-resume-loop.test
@@ -1,113 +0,0 @@
-#!/bin/bash
-# Issue #206: a continue/update crawl looped forever when the resume Range got a
-# 416. Pass 1 leaves a partial + temp-ref; pass 2 must terminate and not loop.
-set -u
-
-: "${top_srcdir:=..}"
-testdir=$(cd "$(dirname "$0")" && pwd)
-server="${testdir}/local-server.py"
-
-command -v python3 >/dev/null || ! echo "python3 not found; skipping" || exit 77
-
-tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/httrack_206.XXXXXX") || exit 1
-serverpid=
-crawlpid=
-cleanup() {
-    test -n "$crawlpid" && kill -9 "$crawlpid" 2>/dev/null
-    if test -n "$serverpid"; then
-        kill "$serverpid" 2>/dev/null
-        wait "$serverpid" 2>/dev/null
-    fi
-    rm -rf "$tmpdir"
-}
-trap cleanup EXIT HUP INT QUIT PIPE TERM
-
-# --- start the server, discover its ephemeral port --------------------------
-# RESUME_COUNTER gets a byte per /resume/blob.txt request (pass-2 delta bounds re-gets).
-serverlog="${tmpdir}/server.log"
-counter="${tmpdir}/blobcount"
-RESUME_COUNTER="$counter" python3 "$server" --root "${testdir}/server-root" >"$serverlog" 2>&1 &
-serverpid=$!
-port=
-for _ in $(seq 1 50); do
-    line=$(head -n1 "$serverlog" 2>/dev/null)
-    if test "${line%% *}" == "PORT"; then
-        port="${line#PORT }"
-        break
-    fi
-    kill -0 "$serverpid" 2>/dev/null || {
-        echo "server exited early: $(cat "$serverlog")"
-        exit 1
-    }
-    sleep 0.1
-done
-test -n "$port" || {
-    echo "could not discover server port"
-    exit 1
-}
-base="http://127.0.0.1:${port}"
-
-which httrack >/dev/null || {
-    echo "could not find httrack"
-    exit 1
-}
-out="${tmpdir}/crawl"
-mkdir "$out"
-common=(-O "$out" --quiet --disable-security-limits --robots=0 --timeout=30 --retries=0)
-refdir="${out}/hts-cache/ref"
-
-# --- pass 1: crawl, interrupt once the blob download is underway -------------
-printf '[pass 1: interrupt mid-download] ..\t'
-httrack "${common[@]}" "${base}/resume/index.html" >"${tmpdir}/log1" 2>&1 &
-crawlpid=$!
-# Wait until blob.txt is requested, then SIGTERM so httrack's exit handler
-# finalizes the cache and serializes the temp-ref.
-for _ in $(seq 1 300); do
-    test -s "$counter" && break
-    kill -0 "$crawlpid" 2>/dev/null || break
-    sleep 0.1
-done
-sleep 0.5
-kill -TERM "$crawlpid" 2>/dev/null
-wait "$crawlpid" 2>/dev/null
-crawlpid=
-test -n "$(find "$refdir" -name '*.ref' 2>/dev/null)" || {
-    echo "FAIL: no temp-ref survived pass 1; cannot drive #206"
-    exit 1
-}
-echo "OK (temp-ref present)"
-before=$(wc -c <"$counter" 2>/dev/null || echo 0)
-
-# --- pass 2: --continue -> resume Range -> 416, bounded against the #206 loop -
-# Kill pass 2 after a deadline (portable stand-in for `timeout`, absent on macOS).
-printf '[pass 2: resume must terminate] ..\t'
-HANG_RC=137 # 128 + SIGKILL
-httrack "${common[@]}" --continue "${base}/resume/index.html" >"${tmpdir}/log2" 2>&1 &
-crawlpid=$!
-(sleep 30 && kill -9 "$crawlpid" 2>/dev/null) &
-guard=$!
-rc=0
-wait "$crawlpid" 2>/dev/null || rc=$?
-crawlpid=
-kill "$guard" 2>/dev/null || true
-wait "$guard" 2>/dev/null || true
-if test "$rc" -eq "$HANG_RC"; then
-    echo "FAIL: pass 2 did not terminate (#206 resume->416 loop)"
-    exit 1
-fi
-echo "OK (terminated, rc=$rc)"
-
-# The fix re-gets once (resume Range + range-less re-get = 2): the lower bound
-# rejects a drop-the-link non-fix (1), the upper bound rejects the loop (many).
-after=$(wc -c <"$counter" 2>/dev/null || echo 0)
-hits=$((after - before))
-printf '[bounded re-get count] ..\t'
-if test "$hits" -lt 2; then
-    echo "FAIL: only ${hits} pass-2 request(s); the stale partial was not re-got"
-    exit 1
-fi
-if test "$hits" -gt 8; then
-    echo "FAIL: ${hits} pass-2 requests for blob.txt (resume is looping)"
-    exit 1
-fi
-echo "OK (${hits} requests)"
--- a/tests/21_local-intl-update.test
+++ b/tests/21_local-intl-update.test
@@ -1,11 +0,0 @@
-#!/bin/bash
-#
-# #157: a dotless, accented URL named .html on the first crawl must keep .html
-# across an update -- not revert to the extensionless name.
-
-: "${top_srcdir:=..}"
-
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 --rerun \
-    --found 'intl/Instalação_CVS_no_Ubuntu.html' \
-    --not-found 'intl/Instalação_CVS_no_Ubuntu' \
-    httrack 'BASEURL/intl/index.html'
--- a/tests/22_local-broken-size.test
+++ b/tests/22_local-broken-size.test
@@ -1,17 +0,0 @@
-#!/bin/bash
-# Issues #32/#41: a Content-Length that disagrees with the body warns "bogus
-# state (broken size)" and skips the cache; -%B (tolerant) accepts it.
-
-: "${top_srcdir:=..}"
-
-# Default: warn, but the file is still written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
-    --found 'size/oversize.bin' \
-    --log-found 'bogus state \(broken size' \
-    httrack 'BASEURL/size/index.html'
-
-# -%B (tolerant): no warning, file written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 0 \
-    --found 'size/oversize.bin' \
-    --log-not-found 'bogus state' \
-    httrack 'BASEURL/size/index.html' '-%B'
--- a/tests/23_local-errpage.test
+++ b/tests/23_local-errpage.test
@@ -1,19 +0,0 @@
-#!/bin/bash
-# Issue #17: with "no error pages" (-o0), 4xx/5xx bodies must not be written;
-# a genuine 0-byte 200 stays. Default (-o1) writes the error page. (#17's purge
-# half also does not reproduce; the purge path is not exercised here.)
-set -e
-
-: "${top_srcdir:=..}"
-
-# -o0: 404 suppressed, good page and the legit 0-byte 200 kept.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
-    --found 'errpage/good.html' \
-    --found 'errpage/empty.html' \
-    --not-found 'errpage/missing.html' \
-    httrack 'BASEURL/errpage/index.html' '-o0'
-
-# Control -o1 (default): the 404 error page is written.
-bash "$top_srcdir/tests/local-crawl.sh" --errors 1 \
-    --found 'errpage/missing.html' \
-    httrack 'BASEURL/errpage/index.html' '-o1'
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -26,7 +26,6 @@ TESTS = \
 	00_runnable.test \
 	01_engine-cache.test \
 	01_engine-cache-golden.test \
-	01_engine-cache-writefail.test \
 	01_engine-charset.test \
 	01_engine-cmdline.test \
 	01_engine-cookies.test \
@@ -60,10 +59,6 @@ TESTS = \
 	16_local-assume.test \
 	17_local-empty-ct.test \
 	18_local-update.test \
-	19_local-connect-fallback.test \
-	20_local-resume-loop.test \
-	21_local-intl-update.test \
-	22_local-broken-size.test \
-	23_local-errpage.test
+	19_local-connect-fallback.test

 CLEANFILES = check-network_sh.cache
--- a/tests/local-crawl.sh
+++ b/tests/local-crawl.sh
@@ -14,9 +14,7 @@
 # Usage:
 #   bash local-crawl.sh [--tls] [--root DIR] \
 #       --errors N --files N --found PATH ... --directory PATH ... \
-#       --log-found REGEX ... --log-not-found REGEX ... \
 #       httrack BASEURL/some/path [httrack-args...]
-# --log-found/--log-not-found grep (ERE) the crawl's hts-log.txt.

 set -u

@@ -109,7 +107,7 @@ while test "$pos" -lt "$nargs"; do
        audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
        pos=$((pos + 1))
        ;;
-    --found | --not-found | --directory | --log-found | --log-not-found)
+    --found | --not-found | --directory)
        audit+=("${args[$pos]}" "${args[$((pos + 1))]}")
        pos=$((pos + 1))
        ;;
@@ -198,15 +196,6 @@ if test -n "$rerun"; then
        exit 1
    }
    result "OK (update)"
-    # The update summary reports "files updated"; a fresh crawl never does. Assert
-    # it so a regression that bypasses the cache (re-crawls fresh) can't pass.
-    info "checking update used the cache"
-    if grep -aqE "mirror complete in .*files updated" "${out}/hts-log.txt"; then
-        result "OK"
-    else
-        result "update pass did not report cache activity"
-        exit 1
-    fi
 fi

 # --- discover the single host root (127.0.0.1_<port> or 127.0.0.1) -----------
@@ -259,22 +248,6 @@ while test "$i" -lt "${#audit[@]}"; do
            exit 1
        fi
        ;;
-    --log-found)
-        i=$((i + 1))
-        info "checking log matches ${audit[$i]}"
-        if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then result "OK"; else
-            result "not in log"
-            exit 1
-        fi
-        ;;
-    --log-not-found)
-        i=$((i + 1))
-        info "checking log lacks ${audit[$i]}"
-        if grep -aqE "${audit[$i]}" "${out}/hts-log.txt"; then
-            result "present in log"
-            exit 1
-        else result "OK"; fi
-        ;;
    esac
    i=$((i + 1))
 done
--- a/tests/local-server.py
+++ b/tests/local-server.py
@@ -15,7 +15,6 @@ stdlib only (http.server + ssl) -- no new build or runtime dependency.

 import argparse
 import os
-import time
 from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
 from urllib.parse import quote, unquote, urlsplit

@@ -177,87 +176,6 @@ class Handler(SimpleHTTPRequestHandler):
        body, ctype = self.TYPE_MATRIX[path]
        self.send_raw(body, ctype)

-    # --- special chars in URLs across an update (issue #157) ---------------
-    # A dotless, accented basename served as text/html (MediaWiki style). The
-    # name the first crawl picks (.html) must survive the update pass.
-    INTL_NAME = "Instalação_CVS_no_Ubuntu"
-
-    def route_intl_index(self):
-        self.send_html('\t<a href="%s">accented</a>\n' % self.INTL_NAME)
-
-    def route_intl_page(self):
-        self.send_raw(b"<html><body>accented page</body></html>\n", "text/html")
-
-    # resume / 416 loop (#206): the first GET stalls after a prefix so the crawl
-    # can be interrupted (partial + temp-ref); every later request is 416.
-    RESUME_PREFIX = b"PARTIAL-" + b"x" * 4096  # flushed before the stall
-    RESUME_LEN = len(RESUME_PREFIX) + 4096  # declared length never delivered
-    _resume_started = False
-
-    def route_resume_index(self):
-        self.send_html('\t<a href="blob.txt">blob</a>')
-
-    def route_resume(self):
-        counter = os.environ.get("RESUME_COUNTER")
-        if counter:
-            with open(counter, "a") as fp:
-                fp.write("x")
-        # First GET: stall mid-body so the crawl can be interrupted with a partial.
-        if not Handler._resume_started:
-            Handler._resume_started = True
-            self.send_response(200)
-            self.send_header("Content-Type", "image/png")
-            self.send_header("Content-Length", str(self.RESUME_LEN))
-            self.send_header("Accept-Ranges", "bytes")
-            self.end_headers()
-            if self.command != "HEAD":
-                self.wfile.write(self.RESUME_PREFIX)
-                self.wfile.flush()
-                try:
-                    while True:
-                        time.sleep(3600)
-                except OSError:
-                    pass
-            return
-        self.send_response(416, "Requested Range Not Satisfiable")
-        self.send_header("Content-Type", "image/png")
-        self.send_header("Content-Range", "bytes */%d" % self.RESUME_LEN)
-        self.send_header("Content-Length", "0")
-        self.end_headers()
-
-    # error pages / 0-byte files (#17): -o0 ("no error pages") must keep 4xx/5xx
-    # bodies off disk; a genuine 0-byte 200 is a valid file and stays.
-    def route_errpage_index(self):
-        self.send_html(
-            '\t<a href="good.html">good</a>\n'
-            '\t<a href="missing.html">missing</a>\n'
-            '\t<a href="empty.html">empty</a>\n'
-        )
-
-    def route_errpage_good(self):
-        self.send_raw(b"<html><body>good page</body></html>\n", "text/html")
-
-    def route_errpage_missing(self):
-        self.send_html("\t404 error body", status=404, extra_status="Not Found")
-
-    def route_errpage_empty(self):
-        self.send_raw(b"", "text/html")
-
-    # broken Content-Length (#32/#41): declared size != bytes sent. httrack
-    # warns "bogus state (broken size)" and skips the cache unless -%B.
-    def route_size_index(self):
-        self.send_html('\t<a href="oversize.bin">over</a>\n')
-
-    def route_size_oversize(self):
-        body = b"A" * 100
-        self.send_response(200)
-        self.send_header("Content-Type", "application/octet-stream")
-        self.send_header("Content-Length", str(len(body) - 2))  # lie: too short
-        self.send_header("Connection", "close")
-        self.end_headers()
-        if self.command != "HEAD":
-            self.wfile.write(body)
-
    ROUTES = {
        "/cookies/entrance.php": route_entrance,
        "/cookies/second.php": route_second,
@@ -277,16 +195,6 @@ class Handler(SimpleHTTPRequestHandler):
        "/types/style.css": route_types,
        "/types/data.json": route_types,
        "/types/gen.php": route_types,
-        "/intl/index.html": route_intl_index,
-        "/intl/" + INTL_NAME: route_intl_page,
-        "/resume/index.html": route_resume_index,
-        "/resume/blob.txt": route_resume,
-        "/size/index.html": route_size_index,
-        "/size/oversize.bin": route_size_oversize,
-        "/errpage/index.html": route_errpage_index,
-        "/errpage/good.html": route_errpage_good,
-        "/errpage/missing.html": route_errpage_missing,
-        "/errpage/empty.html": route_errpage_empty,
    }

    # --- dispatch ----------------------------------------------------------
@@ -294,8 +202,7 @@ class Handler(SimpleHTTPRequestHandler):
    def dispatch(self):
        self._set_cookies = []
        path = urlsplit(self.path).path
-        # Match percent-encoded paths (accented #157 route) by their decoded form.
-        handler = self.ROUTES.get(path) or self.ROUTES.get(unquote(path))
+        handler = self.ROUTES.get(path)
        if handler is not None:
            handler(self)
            return True