Fixed issue 25 regarding un-encoding of characters such as # in the filename.

This commit is contained in:
Xavier Roche
2013-08-17 09:09:13 +00:00
parent e002254001
commit 2d6017ad06
5 changed files with 55 additions and 13 deletions

View File

@@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
}
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
const int flags) {
size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
int seenQuery = 0;
char utfBuffer[32];
@@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
cUtf = (unsigned char) ec;
/* Shortcut for ASCII (do not unescape non-printable) */
if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
if (
(cUtf < 0x80 && cUtf >= 32)
&& ( flags & UNESCAPE_URL_NO_ASCII ) == 0
) {
/* Rollback new write position and character */
j = lastJ;
c = ec;
@@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
/* ASCII (and not in %xx) */
else if (cUtf < 0x80 && i != lastI + 1) {
k = 0; /* cancel any sequence */
if (!seenQuery && c == '?') {
if (c == '?' && !seenQuery) {
seenQuery = 1;
}
}
@@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
return 0;
}
/* Plain URL unescaping: delegates to hts_unescapeUrlSpecial() with an
   empty flag mask and hands back its result unchanged. */
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
  const int noFlags = 0;

  return hts_unescapeUrlSpecial(src, dest, max, noFlags);
}

View File

@@ -41,6 +41,14 @@ Please visit our Website: http://www.httrack.com
#include <windows.h>
#endif
/**
 * Option bits that may be combined into the "flags" mask
 * given to hts_unescapeUrlSpecial().
 **/
typedef enum unescapeFlags {
  /** Do not decode ASCII characters. **/
  UNESCAPE_URL_NO_ASCII = 0x01
} unescapeFlags;
/**
* Unescape HTML entities (as per HTML 4.0 Specification)
* and replace them in-place by their UTF-8 equivalents.
@@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src,
**/
extern int hts_unescapeUrl(const char *src, char *dest, const size_t max);
/**
 * Unescape a URL-encoded string. The implicit charset is UTF-8.
 * In case of UTF-8 decoding error inside URL-encoded characters,
 * the characters are left undecoded.
 * "src" is the URL-encoded input and "dest" receives the result;
 * "max" is the capacity of "dest" (presumably in bytes, terminating
 * NUL included -- TODO confirm against the implementation).
 * "flags" is a mask composed of UNESCAPE_URL_XXX constants
 * (see enum unescapeFlags); a mask of 0 gives the behavior of
 * hts_unescapeUrl().
 * Note: source and destination MUST NOT be the same.
 * Returns 0 upon success, -1 upon overflow or error.
 **/
extern int hts_unescapeUrlSpecial(const char *src,
char *dest, const size_t max,
int flags);
#endif

View File

@@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) {
// unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI
// DOES NOT DECODE %25 (part of CHAR_DELIM)
// no_high & 1: do NOT decode high chars (leave them %XX-escaped)
// no_high & 2: allow decoding of space
HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) {
size_t i, j;
@@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high)
|| CHAR_DELIM(nchar)
|| CHAR_UNWISE(nchar)
|| CHAR_LOW(nchar) /* CHAR_SPECIAL */
|| CHAR_XXAVOID(nchar)
|| ( no_high && CHAR_HIG(nchar) )
|| ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) )
|| ( ( no_high & 1 ) && CHAR_HIG(nchar) )
;
if (!test && nchar >= 0) { /* can safely unescape */

View File

@@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
const int hasCharset = charset != NULL
&& *charset != '\0';
char BIGSTK query[HTS_URLMAXSIZE * 2];
char *const a = strchr(lien, '?');
// cut query string
if (a != NULL) {
strcpybuff(query, a);
*a = '\0';
} else {
query[0] = '\0';
{
char *const a = strchr(lien, '?');
if (a != NULL) {
strcpybuff(query, a);
*a = '\0';
} else {
query[0] = '\0';
}
}
// Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */
strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2)); /* note: '%' is still escaped */
// Force to encode non-printable chars (should never happen)
escape_remove_control(lien);
@@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
// Decode remaining %XX high characters with UTF-8
// but only when this leads to valid UTF-8.
// Otherwise, leave them escaped (undecoded).
if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff),
UNESCAPE_URL_NO_ASCII) == 0) {
strcpybuff(lien, catbuff);
} else {
hts_log_print(opt, LOG_WARNING,

View File

@@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \
--found "ut.httrack.com/parsing/foo barae52.html" \
--found "ut.httrack.com/parsing/foo bar7b30.html" \
httrack http://ut.httrack.com/parsing/escaping.html
# handling of # encoded in filename
# see http://code.google.com/p/httrack/issues/detail?id=25
bash crawl-test.sh --errors 2 --files 4 \
--found "ut.httrack.com/parsing/escaping2.html" \
--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
--found "ut.httrack.com/parsing/foo#bar#.html" \
--found "ut.httrack.com/parsing/foo bar.html" \
httrack http://ut.httrack.com/parsing/escaping2.html