825cb52060
Bug#54180 - Fragments containing '?' Do not throw a notice on bad URLs Add 'port' to return value after 'host', not at end of array
486 linhas
12 KiB
C++
486 linhas
12 KiB
C++
/*
|
|
+----------------------------------------------------------------------+
|
|
| HipHop for PHP |
|
|
+----------------------------------------------------------------------+
|
|
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
|
|
| Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
|
|
+----------------------------------------------------------------------+
|
|
| This source file is subject to version 2.00 of the Zend license, |
|
|
| that is bundled with this package in the file LICENSE, and is |
|
|
| available through the world-wide-web at the following url: |
|
|
| http://www.zend.com/license/2_00.txt. |
|
|
| If you did not receive a copy of the Zend license and are unable to |
|
|
| obtain it through the world-wide-web, please send a note to |
|
|
| license@zend.com so we can mail you a copy immediately. |
|
|
+----------------------------------------------------------------------+
|
|
*/
|
|
|
|
#include "hphp/runtime/base/zend_url.h"
|
|
#include "hphp/runtime/base/zend_string.h"
|
|
|
|
namespace HPHP {
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
 * Overwrite every control character in the first `len` bytes of `str`
 * with '_' (in place).
 *
 * @param str  buffer to sanitise; may be null
 * @param len  number of bytes of `str` to examine
 * @return     `str` itself (null when `str` is null)
 */
static char *replace_controlchars(char *str, int len) {
  /* Bail out before doing any pointer arithmetic on a null pointer. */
  if (!str) {
    return str;
  }

  unsigned char *s = (unsigned char *)str;
  unsigned char *e = s + len;

  while (s < e) {
    if (iscntrl(*s)) {
      *s = '_';
    }
    s++;
  }

  return str;
}
|
|
|
|
// Release every string component owned by this Url.
// free() treats a null pointer as a no-op, so components that were never
// set need no explicit guard.
Url::~Url() {
  free(scheme);
  free(user);
  free(pass);
  free(host);
  free(path);
  free(query);
  free(fragment);
}
|
|
|
|
/**
 * Split a URL into its components and store them in `output`.
 *
 * `output` is zero-filled first (any pointers it previously held are
 * overwritten, not freed). Each component that is present is copied with
 * string_duplicate() and has its control characters replaced by '_'.
 *
 * NOTE: several look-ahead reads (`*(e + 1)`, `*(e + 2)`, `*pp`, ...) compare
 * against '\0', i.e. the parser expects `str` to be NUL-terminated at or
 * after `str + length`.
 *
 * @param output  receives scheme, user, pass, host, port, path, query and
 *                fragment
 * @param str     URL text
 * @param length  number of bytes of `str` to parse
 * @return false when a port is present but is not in 1..65535 (or is longer
 *         than 5 characters), or when an authority section has an empty
 *         host; true otherwise. On a false return `output` may be partially
 *         filled.
 */
bool url_parse(Url &output, const char *str, int length) {
  memset(&output, 0, sizeof(Url));

  char port_buf[6];
  const char *s, *e, *p, *pp, *ue;

  s = str;
  ue = s + length;

  /* parse scheme */
  if ((e = (const char *)memchr((const void *)s, ':', length)) && (e - s)) {
    /* validate scheme */
    p = s;
    while (p < e) {
      /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
      if (!isalpha((unsigned char)*p) && !isdigit((unsigned char)*p) &&
          *p != '+' && *p != '.' && *p != '-') {
        /* not a scheme: the ':' may introduce a port, or be part of a path */
        if (e + 1 < ue) {
          goto parse_port;
        } else {
          goto just_path;
        }
      }
      p++;
    }

    if (*(e + 1) == '\0') { /* only scheme is available */
      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));
      goto end;
    }

    /*
     * certain schemas like mailto: and zlib: may not have any / after them
     * this check ensures we support those.
     */
    if (*(e + 1) != '/') {
      /* check if the data we get is a port this allows us to
       * correctly parse things like a.com:80
       */
      p = e + 1;
      while (isdigit((unsigned char)*p)) {
        p++;
      }

      if ((*p == '\0' || *p == '/') && (p - e) < 7) {
        goto parse_port;
      }

      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));

      /* everything after "scheme:" is treated as path/query/fragment */
      length -= ++e - s;
      s = e;
      goto just_path;
    } else {
      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));

      if (*(e + 2) == '/') {
        /* "scheme://": the authority starts after the two slashes */
        s = e + 3;
        if (!strncasecmp("file", output.scheme, sizeof("file"))) {
          if (*(e + 3) == '/') {
            /* support windows drive letters as in:
               file:///c:/somedir/file.txt
            */
            if (*(e + 5) == ':') {
              s = e + 4;
            }
            goto nohost;
          }
        }
      } else {
        if (!strncasecmp("file", output.scheme, sizeof("file"))) {
          s = e + 1;
          goto nohost;
        } else {
          length -= ++e - s;
          s = e;
          goto just_path;
        }
      }
    }
  } else if (e) { /* no scheme, look for port */
    parse_port:
    p = e + 1;
    pp = p;

    while (pp - p < 6 && isdigit((unsigned char)*pp)) {
      pp++;
    }

    if (pp - p < 6 && (*pp == '/' || *pp == '\0')) {
      memcpy(port_buf, p, (pp - p));
      port_buf[pp - p] = '\0';
      auto port = atoi(port_buf);
      if (port > 0 && port <= 65535) {
        output.port = port;
      } else {
        return false;
      }
    } else {
      goto just_path;
    }
  } else {
    just_path:
    ue = s + length;
    goto nohost;
  }

  e = ue;

  /* the authority ends at the first '/', or failing that at the first
     '?' or '#', or at the end of the input */
  if (!(p = (const char *)memchr(s, '/', (ue - s)))) {
    const char *query = (const char *)memchr(s, '?', (ue - s));
    const char *fragment = (const char *)memchr(s, '#', (ue - s));

    if (query && fragment) {
      e = (query > fragment) ? fragment : query;
    } else if (query) {
      e = query;
    } else if (fragment) {
      e = fragment;
    }
  } else {
    e = p;
  }

  /* check for login and password */
  if ((p = (const char *)memrchr(s, '@', (e - s)))) {
    if ((pp = (const char *)memchr(s, ':', (p - s)))) {
      if ((pp - s) > 0) {
        output.user = string_duplicate(s, (pp - s));
        replace_controlchars(output.user, (pp - s));
      }

      pp++;
      if (p - pp > 0) {
        output.pass = string_duplicate(pp, (p - pp));
        replace_controlchars(output.pass, (p - pp));
      }
    } else {
      output.user = string_duplicate(s, (p - s));
      replace_controlchars(output.user, (p - s));
    }

    s = p + 1;
  }

  /* check for port */
  if (*s == '[' && *(e - 1) == ']') {
    /* Short circuit portscan,
       we're dealing with an
       IPv6 embedded address */
    p = s;
  } else {
    /* memrchr is a GNU specific extension
       Emulate for wide compatibility.
       The bounds test comes first so that *p is never read before s. */
    for (p = e; p >= s && *p != ':'; p--);
  }

  if (p >= s && *p == ':') {
    if (!output.port) {
      p++;
      if (e - p > 5) { /* port cannot be longer than 5 characters */
        return false;
      } else if (e - p > 0) {
        memcpy(port_buf, p, (e - p));
        port_buf[e - p] = '\0';
        auto port = atoi(port_buf);
        if (port > 0 && port <= 65535) {
          output.port = port;
        } else {
          return false;
        }
      }
      p--; /* step back onto the ':' so the host ends just before it */
    }
  } else {
    p = e;
  }

  /* check if we have a valid host, if we don't reject the string as url */
  if ((p - s) < 1) {
    return false;
  }

  output.host = string_duplicate(s, (p - s));
  replace_controlchars(output.host, (p - s));

  if (e == ue) {
    return true;
  }

  s = e;

  nohost:

  if ((p = (const char *)memchr(s, '?', (ue - s)))) {
    /* bounded search: never look for '#' beyond the end of the input */
    pp = (const char *)memchr(s, '#', (ue - s));

    if (pp && pp < p) {
      /* the '#' precedes the '?', so the '?' belongs to the fragment */
      if (pp - s) {
        output.path = string_duplicate(s, (pp - s));
        replace_controlchars(output.path, (pp - s));
      }
      /* always restart from the '#', even when the path is empty */
      p = pp;
      goto label_parse;
    }

    if (p - s) {
      output.path = string_duplicate(s, (p - s));
      replace_controlchars(output.path, (p - s));
    }

    if (pp) {
      if (pp - ++p) {
        output.query = string_duplicate(p, (pp - p));
        replace_controlchars(output.query, (pp - p));
      }
      p = pp;
      goto label_parse;
    } else if (++p - ue) {
      output.query = string_duplicate(p, (ue - p));
      replace_controlchars(output.query, (ue - p));
    }
  } else if ((p = (const char *)memchr(s, '#', (ue - s)))) {
    if (p - s) {
      output.path = string_duplicate(s, (p - s));
      replace_controlchars(output.path, (p - s));
    }

    label_parse:
    p++; /* skip the '#' */

    if (ue - p) {
      output.fragment = string_duplicate(p, (ue - p));
      replace_controlchars(output.fragment, (ue - p));
    }
  } else {
    output.path = string_duplicate(s, (ue - s));
    replace_controlchars(output.path, (ue - s));
  }
  end:
  return true;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
 * Convert the two characters at `s` from hexadecimal to an integer.
 * Upper-case letters are folded to lower case first. The characters are not
 * validated; callers check them with isxdigit() beforehand.
 *
 * @param s  pointer to at least two readable characters
 * @return   16 * value(s[0]) + value(s[1])
 */
static int php_htoi(char *s) {
  const unsigned char *digits = (const unsigned char *)s;
  int result = 0;

  for (int idx = 0; idx < 2; idx++) {
    int ch = digits[idx];
    if (isupper(ch)) {
      ch = tolower(ch);
    }
    int nibble = (ch >= '0' && ch <= '9') ? ch - '0' : ch - 'a' + 10;
    result = result * 16 + nibble;
  }

  return result;
}
|
|
|
|
/* rfc1738:

   ...The characters ";",
   "/", "?", ":", "@", "=" and "&" are the characters which may be
   reserved for special meaning within a scheme...

   ...Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
   reserved characters used for their reserved purposes may be used
   unencoded within a URL...

   For added safety, we only leave -_. unencoded.
 */
|
|
|
|
/* Upper-case hexadecimal digits indexed by nibble value (0-15); the encoders
   in this file use it to emit the two digits of a %XX escape. */
static unsigned char hexchars[] = "0123456789ABCDEF";
|
|
|
|
char *url_encode(const char *s, int &len) {
|
|
register unsigned char c;
|
|
unsigned char *to, *start;
|
|
unsigned char const *from, *end;
|
|
|
|
from = (unsigned char const *)s;
|
|
end = (unsigned char const *)s + len;
|
|
start = to = (unsigned char *)malloc(3 * len + 1);
|
|
|
|
while (from < end) {
|
|
c = *from++;
|
|
|
|
if (c == ' ') {
|
|
*to++ = '+';
|
|
} else if ((c < '0' && c != '-' && c != '.') ||
|
|
(c < 'A' && c > '9') ||
|
|
(c > 'Z' && c < 'a' && c != '_') ||
|
|
(c > 'z')) {
|
|
to[0] = '%';
|
|
to[1] = hexchars[c >> 4];
|
|
to[2] = hexchars[c & 15];
|
|
to += 3;
|
|
} else {
|
|
*to++ = c;
|
|
}
|
|
}
|
|
*to = 0;
|
|
len = to - start;
|
|
return (char *) start;
|
|
}
|
|
|
|
char *url_decode(const char *s, int &len) {
|
|
char *str = string_duplicate(s, len);
|
|
char *dest = str;
|
|
char *data = str;
|
|
|
|
while (len--) {
|
|
if (*data == '+') {
|
|
*dest = ' ';
|
|
}
|
|
else if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
|
|
&& isxdigit((int) *(data + 2))) {
|
|
*dest = (char) php_htoi(data + 1);
|
|
data += 2;
|
|
len -= 2;
|
|
} else {
|
|
*dest = *data;
|
|
}
|
|
data++;
|
|
dest++;
|
|
}
|
|
*dest = '\0';
|
|
len = dest - str;
|
|
return str;
|
|
}
|
|
|
|
// copied and re-factored from clearsilver-0.10.5/cgi/cgi.c
|
|
/**
 * Decode a NUL-terminated, form-style percent-encoded string in place.
 *
 * '+' becomes a space and "%XX" (two hexadecimal digits) becomes the
 * corresponding byte; anything else is copied as is. The output never grows
 * past the input, so the write cursor never overtakes the read cursor.
 *
 * @param value  non-null, non-empty string to decode (asserted)
 * @return       length of the decoded string
 */
int url_decode(char *value) {
  assert(value && *value); // check before calling this function

  unsigned char *const base = (unsigned char *)value;
  const unsigned char *rd = base;
  unsigned char *wr = base;

  while (*rd) {
    if (*rd == '+') {
      *wr++ = ' ';
      rd++;
    } else if (*rd == '%' && isxdigit(rd[1]) && isxdigit(rd[2])) {
      // Clearing bit 0x20 folds 'a'-'f' onto 'A'-'F'.
      int hi = (rd[1] >= 'A') ? ((rd[1] & 0xdf) - 'A') + 10 : (rd[1] - '0');
      int lo = (rd[2] >= 'A') ? ((rd[2] & 0xdf) - 'A') + 10 : (rd[2] - '0');
      *wr++ = (unsigned char)(hi * 16 + lo);
      rd += 3;
    } else {
      *wr++ = *rd++;
    }
  }

  if (rd != base && wr != base) {
    *wr = '\0';
  }
  return (int)(wr - base);
}
|
|
|
|
/**
 * Decode the first `len` bytes of a form-style percent-encoded buffer in
 * place.
 *
 * '+' becomes a space and "%XX" (two hexadecimal digits) becomes the
 * corresponding byte; anything else is copied as is. A '%' whose two digits
 * would lie outside the first `len` bytes is copied unchanged, so no byte
 * beyond `len` is ever read as input.
 *
 * A terminating NUL is written after the decoded data; when nothing was
 * shortened this lands on value[len], which must therefore be writable.
 *
 * @param value  non-null, non-empty buffer to decode (asserted)
 * @param len    number of bytes of `value` to decode (asserted >= 0)
 * @return       length of the decoded data, or 0 when `len` <= 0
 */
int url_decode_ex(char *value, int len) {
  assert(value && *value); // check before calling this function
  assert(len >= 0);
  if (len <= 0) return 0;

  int i = 0, o = 0;
  unsigned char *s = (unsigned char *)value;

  while (i < len) {
    if (s[i] == '+') {
      s[o++] = ' ';
      i++;
    } else if (s[i] == '%' && len - i > 2 &&
               isxdigit(s[i+1]) && isxdigit(s[i+2])) {
      char num;
      // Clearing bit 0x20 folds 'a'-'f' onto 'A'-'F'.
      num = (s[i+1] >= 'A') ? ((s[i+1] & 0xdf) - 'A') + 10 : (s[i+1] - '0');
      num *= 16;
      num += (s[i+2] >= 'A') ? ((s[i+2] & 0xdf) - 'A') + 10 : (s[i+2] - '0');
      s[o++] = num;
      i += 3;
    } else {
      s[o++] = s[i++];
    }
  }
  if (i && o) s[o] = '\0';
  return o;
}
|
|
|
|
char *url_raw_encode(const char *s, int &len) {
|
|
register int x, y;
|
|
unsigned char *str;
|
|
|
|
str = (unsigned char *)malloc(3 * len + 1);
|
|
for (x = 0, y = 0; len--; x++, y++) {
|
|
str[y] = (unsigned char) s[x];
|
|
if ((str[y] < '0' && str[y] != '-' && str[y] != '.') ||
|
|
(str[y] < 'A' && str[y] > '9') ||
|
|
(str[y] > 'Z' && str[y] < 'a' && str[y] != '_') ||
|
|
(str[y] > 'z' && str[y] != '~')) {
|
|
str[y++] = '%';
|
|
str[y++] = hexchars[(unsigned char) s[x] >> 4];
|
|
str[y] = hexchars[(unsigned char) s[x] & 15];
|
|
}
|
|
}
|
|
str[y] = '\0';
|
|
len = y;
|
|
return ((char *)str);
|
|
}
|
|
|
|
char *url_raw_decode(const char *s, int &len) {
|
|
char *str = string_duplicate(s, len);
|
|
char *dest = str;
|
|
char *data = str;
|
|
|
|
while (len--) {
|
|
if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
|
|
&& isxdigit((int) *(data + 2))) {
|
|
*dest = (char) php_htoi(data + 1);
|
|
data += 2;
|
|
len -= 2;
|
|
} else {
|
|
*dest = *data;
|
|
}
|
|
data++;
|
|
dest++;
|
|
}
|
|
*dest = '\0';
|
|
len = dest - str;
|
|
return str;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
}
|