Fixed issue 25 regarding un-encoding of characters such as # in the filename.

2026-05-17 00:16:02 +03:00 · 2013-08-17 09:09:13 +00:00
parent e002254001
commit 2d6017ad06
5 changed files with 55 additions and 13 deletions
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
  return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
 }

-int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
+                           const int flags) {
  size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
  int seenQuery = 0;
  char utfBuffer[32];
@@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
        cUtf = (unsigned char) ec;

        /* Shortcut for ASCII (do not unescape non-printable) */
-        if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
+        if (
+            (cUtf < 0x80 && cUtf >= 32)
+            && ( flags & UNESCAPE_URL_NO_ASCII ) == 0
+            ) {
          /* Rollback new write position and character */
          j = lastJ;
          c = ec;
@@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
    /* ASCII (and not in %xx) */
    else if (cUtf < 0x80 && i != lastI + 1) {
      k = 0;  /* cancel any sequence */
-      if (!seenQuery && c == '?') {
+      if (c == '?' && !seenQuery) {
        seenQuery = 1;
      }
    }
@@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {

  return 0;
 }
+
+int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+  return hts_unescapeUrlSpecial(src, dest, max, 0);
+}
--- a/src/htsencoding.h
+++ b/src/htsencoding.h
@@ -41,6 +41,14 @@ Please visit our Website: http://www.httrack.com
 #include <windows.h>
 #endif

+/**
+ * Flags for hts_unescapeUrlSpecial().
+ **/
+typedef enum unescapeFlags {
+  /** Do not decode ASCII. **/
+  UNESCAPE_URL_NO_ASCII = 1
+} unescapeFlags;
+
 /**
 * Unescape HTML entities (as per HTML 4.0 Specification)
 * and replace them in-place by their UTF-8 equivalents.
@@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src,
 **/
 extern int hts_unescapeUrl(const char *src, char *dest, const size_t max);

+/**
+ * Unescape a URL-encoded string. The implicit charset is UTF-8.
+ * In case of UTF-8 decoding error inside URL-encoded characters,
+ * the characters are left undecoded.
+ * "flags" is a mask composed of UNESCAPE_URL_XXX constants.
+ * Note: source and destination MUST NOT be the same.
+ * Returns 0 upon success, -1 upon overflow or error.
+ **/
+extern int hts_unescapeUrlSpecial(const char *src,
+                                  char *dest, const size_t max,
+                                  int flags);
+
 #endif
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) {

 // unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI
 // DOES NOT DECODE %25 (part of CHAR_DELIM)
+// no_high & 1: do NOT decode high chars (they are left %-escaped)
+// no_high & 2: allow decoding of space (otherwise kept escaped as CHAR_XXAVOID)
 HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) {
  size_t i, j;

@@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high)
        || CHAR_DELIM(nchar)
        || CHAR_UNWISE(nchar)
        || CHAR_LOW(nchar)    /* CHAR_SPECIAL */
-        || CHAR_XXAVOID(nchar)
-        || ( no_high && CHAR_HIG(nchar) )
+        || ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) )
+        || ( ( no_high & 1 ) && CHAR_HIG(nchar) )
        ;

      if (!test && nchar >= 0) {  /* can safely unescape */
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                  const int hasCharset = charset != NULL 
                    && *charset != '\0';
                  char BIGSTK query[HTS_URLMAXSIZE * 2];
-                  char *const a = strchr(lien, '?');

                  // cut query string
-                  if (a != NULL) {
-                    strcpybuff(query, a);
-                    *a = '\0';
-                  } else {
-                    query[0] = '\0';
+                  {
+                    char *const a = strchr(lien, '?');
+                    if (a != NULL) {
+                      strcpybuff(query, a);
+                      *a = '\0';
+                    } else {
+                      query[0] = '\0';
+                    }
                  }

                  // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
-                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1));     /* note: '%' is still escaped */
+                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2));     /* note: '%' is still escaped */

                  // Force to encode non-printable chars (should never happend)
                  escape_remove_control(lien);
@@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                  // Decode remaining %XX high characters with UTF-8 
                  // but only when this leads to valid UTF-8.
                  // Otherwise, leave them unescaped.
-                  if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
+                  if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff),
+                                             UNESCAPE_URL_NO_ASCII) == 0) {
                    strcpybuff(lien, catbuff);
                  } else {
                    hts_log_print(opt, LOG_WARNING,
--- a/tests/11_crawl-parsing.test
+++ b/tests/11_crawl-parsing.test
@@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \
 	--found "ut.httrack.com/parsing/foo barae52.html" \
 	--found "ut.httrack.com/parsing/foo bar7b30.html" \
 	httrack http://ut.httrack.com/parsing/escaping.html
+
+# handling of # encoded in filename
+# see http://code.google.com/p/httrack/issues/detail?id=25
+bash crawl-test.sh --errors 2 --files 4 \
+	--found "ut.httrack.com/parsing/escaping2.html" \
+	--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
+	--found "ut.httrack.com/parsing/foo#bar#.html" \
+	--found "ut.httrack.com/parsing/foo bar.html" \
+	httrack http://ut.httrack.com/parsing/escaping2.html