mirror of
https://github.com/xroche/httrack.git
synced 2026-05-17 00:16:02 +03:00
Fixed issue 25 regarding un-encoding of characters such as # in the filename.
This commit is contained in:
@@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
|
||||
return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
|
||||
}
|
||||
|
||||
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
|
||||
int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
|
||||
const int flags) {
|
||||
size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
|
||||
int seenQuery = 0;
|
||||
char utfBuffer[32];
|
||||
@@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
|
||||
cUtf = (unsigned char) ec;
|
||||
|
||||
/* Shortcut for ASCII (do not unescape non-printable) */
|
||||
if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
|
||||
if (
|
||||
(cUtf < 0x80 && cUtf >= 32)
|
||||
&& ( flags & UNESCAPE_URL_NO_ASCII ) == 0
|
||||
) {
|
||||
/* Rollback new write position and character */
|
||||
j = lastJ;
|
||||
c = ec;
|
||||
@@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
|
||||
/* ASCII (and not in %xx) */
|
||||
else if (cUtf < 0x80 && i != lastI + 1) {
|
||||
k = 0; /* cancel any sequence */
|
||||
if (!seenQuery && c == '?') {
|
||||
if (c == '?' && !seenQuery) {
|
||||
seenQuery = 1;
|
||||
}
|
||||
}
|
||||
@@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Unescape a URL-encoded string with no special flags: forwards src, dest and max to hts_unescapeUrlSpecial() with flags == 0 and returns its result. */
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
|
||||
return hts_unescapeUrlSpecial(src, dest, max, 0);
|
||||
}
|
||||
|
||||
@@ -41,6 +41,14 @@ Please visit our Website: http://www.httrack.com
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Flags for hts_unescapeUrlSpecial().
|
||||
**/
|
||||
typedef enum unescapeFlags {
|
||||
/** Do not decode ASCII: when set, hts_unescapeUrlSpecial() skips its shortcut that decodes a %xx sequence whose byte is in the range 32..0x7F. **/
|
||||
UNESCAPE_URL_NO_ASCII = 1
|
||||
} unescapeFlags;
|
||||
|
||||
/**
|
||||
* Unescape HTML entities (as per HTML 4.0 Specification)
|
||||
* and replace them in-place by their UTF-8 equivalents.
|
||||
@@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src,
|
||||
**/
|
||||
extern int hts_unescapeUrl(const char *src, char *dest, const size_t max);
|
||||
|
||||
/**
|
||||
* Unescape a URL-encoded string. The implicit charset is UTF-8.
|
||||
* In case of UTF-8 decoding error inside URL-encoded characters,
|
||||
* the characters are left undecoded.
|
||||
* "flags" is a mask composed of UNESCAPE_URL_XXX constants.
|
||||
* Note: source and destination MUST NOT be the same.
|
||||
* Returns 0 upon success, -1 upon overflow or error.
|
||||
**/
|
||||
extern int hts_unescapeUrlSpecial(const char *src,
|
||||
char *dest, const size_t max,
|
||||
int flags);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) {
|
||||
|
||||
// unescape in URL/URI everything EXCEPT what has to stay escaped to form a standard URL/URI
|
||||
// DOES NOT DECODE %25 (part of CHAR_DELIM)
|
||||
// no_high & 1: do NOT decode high chars (CHAR_HIG); they are left %XX-escaped
|
||||
// no_high & 2: decode space
|
||||
HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) {
|
||||
size_t i, j;
|
||||
|
||||
@@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high)
|
||||
|| CHAR_DELIM(nchar)
|
||||
|| CHAR_UNWISE(nchar)
|
||||
|| CHAR_LOW(nchar) /* CHAR_SPECIAL */
|
||||
|| CHAR_XXAVOID(nchar)
|
||||
|| ( no_high && CHAR_HIG(nchar) )
|
||||
|| ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) )
|
||||
|| ( ( no_high & 1 ) && CHAR_HIG(nchar) )
|
||||
;
|
||||
|
||||
if (!test && nchar >= 0) { /* can safely unescape */
|
||||
|
||||
@@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
const int hasCharset = charset != NULL
|
||||
&& *charset != '\0';
|
||||
char BIGSTK query[HTS_URLMAXSIZE * 2];
|
||||
char *const a = strchr(lien, '?');
|
||||
|
||||
// cut query string
|
||||
if (a != NULL) {
|
||||
strcpybuff(query, a);
|
||||
*a = '\0';
|
||||
} else {
|
||||
query[0] = '\0';
|
||||
{
|
||||
char *const a = strchr(lien, '?');
|
||||
if (a != NULL) {
|
||||
strcpybuff(query, a);
|
||||
*a = '\0';
|
||||
} else {
|
||||
query[0] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
// Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
|
||||
strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */
|
||||
strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2)); /* note: '%' is still escaped */
|
||||
|
||||
// Force to encode non-printable chars (should never happen)
|
||||
escape_remove_control(lien);
|
||||
@@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
|
||||
// Decode remaining %XX high characters with UTF-8
|
||||
// but only when this leads to valid UTF-8.
|
||||
// Otherwise, leave them escaped (undecoded).
|
||||
if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
|
||||
if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff),
|
||||
UNESCAPE_URL_NO_ASCII) == 0) {
|
||||
strcpybuff(lien, catbuff);
|
||||
} else {
|
||||
hts_log_print(opt, LOG_WARNING,
|
||||
|
||||
@@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \
|
||||
--found "ut.httrack.com/parsing/foo barae52.html" \
|
||||
--found "ut.httrack.com/parsing/foo bar7b30.html" \
|
||||
httrack http://ut.httrack.com/parsing/escaping.html
|
||||
|
||||
# handling of # encoded in filename
|
||||
# see http://code.google.com/p/httrack/issues/detail?id=25
|
||||
bash crawl-test.sh --errors 2 --files 4 \
|
||||
--found "ut.httrack.com/parsing/escaping2.html" \
|
||||
--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
|
||||
--found "ut.httrack.com/parsing/foo#bar#.html" \
|
||||
--found "ut.httrack.com/parsing/foo bar.html" \
|
||||
httrack http://ut.httrack.com/parsing/escaping2.html
|
||||
|
||||
Reference in New Issue
Block a user