Do not unescape '+' before the query string

Fixed issue 18
This commit is contained in:
Xavier Roche
2013-07-05 17:53:54 +00:00
parent 00fe2d4432
commit d2a3d7a3ff
2 changed files with 15 additions and 1 deletions

View File

@@ -206,6 +206,7 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
int seenQuery = 0;
char utfBuffer[32];
assert(src != dest);
@@ -218,7 +219,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
unsigned char cUtf = (unsigned char) c;
/* Replacement for ' ' */
if (c == '+') {
if (c == '+' && seenQuery) {
c = cUtf = ' ';
k = 0; /* cancel any sequence */
}
@@ -250,6 +251,9 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
/* ASCII (and not in %xx) */
else if (cUtf < 0x80 && i != lastI + 1) {
k = 0; /* cancel any sequence */
if (!seenQuery && c == '?') {
seenQuery = 1;
}
}
/* UTF-8 sequence in progress (either a raw or a %xx character) */

View File

@@ -24,3 +24,13 @@ bash crawl-test.sh --errors 0 --files 3 \
--found ut.httrack.com/parsing/fade.gif \
--found ut.httrack.com/parsing/javascript.html \
httrack http://ut.httrack.com/parsing/javascript.html
# handling of + before query string
bash crawl-test.sh --errors 0 --files 6 \
--found ut.httrack.com/parsing/escaping.html \
--found "ut.httrack.com/parsing/foo bar30f4.html" \
--found "ut.httrack.com/parsing/foo bar5e1f.html" \
--found "ut.httrack.com/parsing/foo+bar3860.html" \
--found "ut.httrack.com/parsing/foo barae52.html" \
--found "ut.httrack.com/parsing/foo bar7b30.html" \
httrack http://ut.httrack.com/parsing/escaping.html