/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   +----------------------------------------------------------------------+
   | This source file is subject to version 2.00 of the Zend license,     |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.zend.com/license/2_00.txt.                                |
   | If you did not receive a copy of the Zend license and are unable to  |
   | obtain it through the world-wide-web, please send a note to          |
   | license@zend.com so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
*/

#include "hphp/runtime/base/zend_url.h"
#include "hphp/runtime/base/zend_string.h"

namespace HPHP {
///////////////////////////////////////////////////////////////////////////////

/**
 * Overwrite every control character (as reported by iscntrl) in the first
 * `len` bytes of `str` with '_', in place.
 *
 * Returns `str`, or nullptr when `str` is null.
 */
static char *replace_controlchars(char *str, int len) {
  if (!str) {
    return (nullptr);
  }

  unsigned char *s = (unsigned char *)str;
  unsigned char *e = (unsigned char *)str + len;

  while (s < e) {
    if (iscntrl(*s)) {
      *s = '_';
    }
    s++;
  }

  return (str);
}

/**
 * Release every component string owned by this Url.
 * free(nullptr) is a no-op, so unset components need no guard.
 */
Url::~Url() {
  free(scheme);
  free(user);
  free(pass);
  free(host);
  free(path);
  free(query);
  free(fragment);
}

/**
 * Split the first `length` bytes of `str` into URL components.
 *
 * `output` is zero-filled first and then populated with whichever of
 * scheme, user, pass, host, port, path, query and fragment are present.
 * Each stored component is a copy made with string_duplicate() and has its
 * control characters replaced by '_'.
 *
 * Returns false when the port is not in 1..65535, is longer than five
 * characters, or when an authority section is present but the host is empty;
 * returns true otherwise.
 *
 * NOTE(review): several look-aheads (e.g. *(e + 1), *(e + 2), *pp) read up to
 * and slightly past str + length; this presumably relies on `str` being
 * NUL-terminated -- confirm against callers.
 */
bool url_parse(Url &output, const char *str, int length) {
  memset(&output, 0, sizeof(Url));

  char port_buf[6];
  const char *s, *e, *p, *pp, *ue;

  s = str;
  ue = s + length;

  /* parse scheme */
  if ((e = (const char *)memchr((const void *)s, ':', length)) && (e - s)) {
    /* validate scheme */
    p = s;
    while (p < e) {
      /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
      /* ctype functions require a value representable as unsigned char */
      if (!isalpha((unsigned char)*p) && !isdigit((unsigned char)*p) &&
          *p != '+' && *p != '.' && *p != '-') {
        if (e + 1 < ue) {
          goto parse_port;
        } else {
          goto just_path;
        }
      }
      p++;
    }

    if (*(e + 1) == '\0') { /* only scheme is available */
      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));
      goto end;
    }

    /*
     * certain schemas like mailto: and zlib: may not have any / after them
     * this check ensures we support those.
     */
    if (*(e + 1) != '/') {
      /* check if the data we get is a port this allows us to
       * correctly parse things like a.com:80
       */
      p = e + 1;
      while (isdigit((unsigned char)*p)) {
        p++;
      }

      if ((*p == '\0' || *p == '/') && (p - e) < 7) {
        goto parse_port;
      }

      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));

      length -= ++e - s;
      s = e;
      goto just_path;
    } else {
      output.scheme = string_duplicate(s, (e - s));
      replace_controlchars(output.scheme, (e - s));

      if (*(e + 2) == '/') {
        s = e + 3;
        /* sizeof("file") includes the NUL, so this is an exact match */
        if (!strncasecmp("file", output.scheme, sizeof("file"))) {
          if (*(e + 3) == '/') {
            /* support windows drive letters as in:
               file:///c:/somedir/file.txt
            */
            if (*(e + 5) == ':') {
              s = e + 4;
            }
            goto nohost;
          }
        }
      } else {
        if (!strncasecmp("file", output.scheme, sizeof("file"))) {
          s = e + 1;
          goto nohost;
        } else {
          length -= ++e - s;
          s = e;
          goto just_path;
        }
      }
    }
  } else if (e) {
    /* no scheme, look for port */
    parse_port:
    p = e + 1;
    pp = p;

    while (pp - p < 6 && isdigit((unsigned char)*pp)) {
      pp++;
    }

    if (pp - p < 6 && (*pp == '/' || *pp == '\0')) {
      memcpy(port_buf, p, (pp - p));
      port_buf[pp - p] = '\0';
      auto port = atoi(port_buf);
      if (port > 0 && port <= 65535) {
        output.port = port;
      } else {
        return false;
      }
    } else {
      goto just_path;
    }
  } else {
    just_path:
    ue = s + length;
    goto nohost;
  }

  e = ue;

  /* the authority section ends at the first '/', '?' or '#' */
  if (!(p = (const char *)memchr(s, '/', (ue - s)))) {
    const char *query = (const char *)memchr(s, '?', (ue - s));
    const char *fragment = (const char *)memchr(s, '#', (ue - s));
    if (query && fragment) {
      e = (query > fragment) ? fragment : query;
    } else if (query) {
      e = query;
    } else if (fragment) {
      e = fragment;
    }
  } else {
    e = p;
  }

  /* check for login and password */
  if ((p = (const char *)memrchr(s, '@', (e - s)))) {
    if ((pp = (const char *)memchr(s, ':', (p - s)))) {
      if ((pp - s) > 0) {
        output.user = string_duplicate(s, (pp - s));
        replace_controlchars(output.user, (pp - s));
      }

      pp++;
      if (p - pp > 0) {
        output.pass = string_duplicate(pp, (p - pp));
        replace_controlchars(output.pass, (p - pp));
      }
    } else {
      output.user = string_duplicate(s, (p - s));
      replace_controlchars(output.user, (p - s));
    }

    s = p + 1;
  }

  /* check for port */
  if (*s == '[' && *(e - 1) == ']') {
    /* Short circuit portscan,
       we're dealing with an
       IPv6 embedded address */
    p = s;
  } else {
    /* memrchr is a GNU specific extension
       Emulate for wide compatibility.
       Test the bound before dereferencing so s[-1] is never read. */
    for (p = e; p >= s && *p != ':'; p--);
  }

  if (p >= s && *p == ':') {
    if (!output.port) {
      p++;
      if (e - p > 5) { /* port cannot be longer than 5 characters */
        return false;
      } else if (e - p > 0) {
        memcpy(port_buf, p, (e - p));
        port_buf[e - p] = '\0';
        auto port = atoi(port_buf);
        if (port > 0 && port <= 65535) {
          output.port = port;
        } else {
          return false;
        }
      }
      p--;
    }
  } else {
    p = e;
  }

  /* check if we have a valid host, if we don't reject the string as url */
  if ((p - s) < 1) {
    return false;
  }

  output.host = string_duplicate(s, (p - s));
  replace_controlchars(output.host, (p - s));

  if (e == ue) {
    return true;
  }

  s = e;

  nohost:

  if ((p = (const char *)memchr(s, '?', (ue - s)))) {
    /* bounded search: never look for '#' beyond the supplied length */
    pp = (const char *)memchr(s, '#', (ue - s));

    if (pp && pp < p) {
      /* '#' precedes '?': everything after '#' is fragment, no query */
      if (pp - s) {
        output.path = string_duplicate(s, (pp - s));
        replace_controlchars(output.path, (pp - s));
      }
      /* always restart at '#', even when the path is empty */
      p = pp;
      goto label_parse;
    }

    if (p - s) {
      output.path = string_duplicate(s, (p - s));
      replace_controlchars(output.path, (p - s));
    }

    if (pp) {
      if (pp - ++p) {
        output.query = string_duplicate(p, (pp - p));
        replace_controlchars(output.query, (pp - p));
      }
      p = pp;
      goto label_parse;
    } else if (++p - ue) {
      output.query = string_duplicate(p, (ue - p));
      replace_controlchars(output.query, (ue - p));
    }
  } else if ((p = (const char *)memchr(s, '#', (ue - s)))) {
    if (p - s) {
      output.path = string_duplicate(s, (p - s));
      replace_controlchars(output.path, (p - s));
    }

    label_parse:
    p++; /* step over '#' */

    if (ue - p) {
      output.fragment = string_duplicate(p, (ue - p));
      replace_controlchars(output.fragment, (ue - p));
    }
  } else {
    output.path = string_duplicate(s, (ue - s));
    replace_controlchars(output.path, (ue - s));
  }
end:
  return true;
}

///////////////////////////////////////////////////////////////////////////////

/**
 * Convert the two hexadecimal digits at s[0] and s[1] to their numeric value.
 * Upper-case digits are folded to lower case first. The input is not
 * validated here; callers check isxdigit() before calling.
 */
static int php_htoi(char *s) {
  int value;
  int c;

  c = ((unsigned char *)s)[0];
  if (isupper(c)) {
    c = tolower(c);
  }
  value = (c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10) * 16;

  c = ((unsigned char *)s)[1];
  if (isupper(c)) {
    c = tolower(c);
  }
  value += c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10;

  return (value);
}

/* rfc1738:

   ...The characters ";",
   "/", "?", ":", "@", "=" and "&" are the characters which may be
   reserved for special meaning within a scheme...

   ...Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
   reserved characters used for their reserved purposes may be used
   unencoded within a URL...

   For added safety, we only leave -_. unencoded.
 */

static const unsigned char hexchars[] = "0123456789ABCDEF";

/**
 * Form-style URL encoding of the `len` bytes at `s`.
 *
 * Space becomes '+'; ASCII letters, digits, '-', '_' and '.' are copied
 * through; every other byte becomes "%XX" with upper-case hex digits.
 *
 * Returns a malloc'ed, NUL-terminated buffer and stores its length (without
 * the NUL) in `len`.
 */
char *url_encode(const char *s, int &len) {
  unsigned char c;
  unsigned char *to, *start;
  unsigned char const *from, *end;

  from = (unsigned char const *)s;
  end = (unsigned char const *)s + len;
  /* worst case: every input byte expands to three output bytes */
  start = to = (unsigned char *)malloc(3 * len + 1);

  while (from < end) {
    c = *from++;

    if (c == ' ') {
      *to++ = '+';
    } else if ((c < '0' && c != '-' && c != '.') ||
               (c < 'A' && c > '9') ||
               (c > 'Z' && c < 'a' && c != '_') ||
               (c > 'z')) {
      to[0] = '%';
      to[1] = hexchars[c >> 4];
      to[2] = hexchars[c & 15];
      to += 3;
    } else {
      *to++ = c;
    }
  }
  *to = 0;
  len = to - start;
  return (char *)start;
}

/**
 * Form-style URL decoding of the `len` bytes at `s` into a fresh copy.
 *
 * '+' becomes a space and "%XX" (two hex digits) becomes the byte it encodes;
 * a '%' not followed by two hex digits inside the input is copied verbatim.
 *
 * Returns the copy obtained from string_duplicate(), NUL-terminated, and
 * stores the decoded length in `len`.
 */
char *url_decode(const char *s, int &len) {
  char *str = string_duplicate(s, len);
  char *dest = str;
  char *data = str;

  while (len--) {
    if (*data == '+') {
      *dest = ' ';
    } else if (*data == '%' && len >= 2 &&
               isxdigit((unsigned char)*(data + 1)) &&
               isxdigit((unsigned char)*(data + 2))) {
      *dest = (char)php_htoi(data + 1);
      data += 2;
      len -= 2;
    } else {
      *dest = *data;
    }
    data++;
    dest++;
  }
  *dest = '\0';
  len = dest - str;
  return str;
}

// copied and re-factored from clearsilver-0.10.5/cgi/cgi.c
/**
 * Decode the NUL-terminated string `value` in place ('+' -> space,
 * "%XX" -> byte) and return the decoded length.
 * `value` must be non-null and non-empty.
 */
int url_decode(char *value) {
  assert(value && *value); // check before calling this function

  int i = 0, o = 0;
  unsigned char *s = (unsigned char *)value;

  while (s[i]) {
    if (s[i] == '+') {
      s[o++] = ' ';
      i++;
    } else if (s[i] == '%' && isxdigit(s[i+1]) && isxdigit(s[i+2])) {
      /* the terminating NUL fails isxdigit, so the look-ahead stays in
         bounds */
      char num;
      num = (s[i+1] >= 'A') ? ((s[i+1] & 0xdf) - 'A') + 10 : (s[i+1] - '0');
      num *= 16;
      num += (s[i+2] >= 'A') ? ((s[i+2] & 0xdf) - 'A') + 10 : (s[i+2] - '0');
      s[o++] = num;
      i += 3;
    } else {
      s[o++] = s[i++];
    }
  }
  if (i && o) s[o] = '\0';
  return o;
}

/**
 * Decode the first `len` bytes of `value` in place ('+' -> space,
 * "%XX" -> byte) and return the decoded length; returns 0 when `len` <= 0.
 *
 * A '%' whose two hex digits do not both lie inside the first `len` bytes is
 * copied verbatim, so nothing beyond value + len is ever read.
 */
int url_decode_ex(char *value, int len) {
  assert(value && *value); // check before calling this function
  assert(len >= 0);
  if (len <= 0) return 0;

  int i = 0, o = 0;
  unsigned char *s = (unsigned char *)value;
  unsigned char *end = s + len;

  while (s + i < end) {
    if (s[i] == '+') {
      s[o++] = ' ';
      i++;
    } else if (s[i] == '%' && s + i + 2 < end &&
               isxdigit(s[i+1]) && isxdigit(s[i+2])) {
      char num;
      num = (s[i+1] >= 'A') ? ((s[i+1] & 0xdf) - 'A') + 10 : (s[i+1] - '0');
      num *= 16;
      num += (s[i+2] >= 'A') ? ((s[i+2] & 0xdf) - 'A') + 10 : (s[i+2] - '0');
      s[o++] = num;
      i += 3;
    } else {
      s[o++] = s[i++];
    }
  }
  if (i && o) s[o] = '\0';
  return o;
}

/**
 * Raw URL encoding of the `len` bytes at `s`.
 *
 * ASCII letters, digits, '-', '_', '.' and '~' are copied through; every
 * other byte, including space, becomes "%XX" with upper-case hex digits.
 *
 * Returns a malloc'ed, NUL-terminated buffer and stores its length (without
 * the NUL) in `len`.
 */
char *url_raw_encode(const char *s, int &len) {
  int x, y;
  unsigned char *str;

  /* worst case: every input byte expands to three output bytes */
  str = (unsigned char *)malloc(3 * len + 1);
  for (x = 0, y = 0; len--; x++, y++) {
    str[y] = (unsigned char)s[x];
    if ((str[y] < '0' && str[y] != '-' && str[y] != '.') ||
        (str[y] < 'A' && str[y] > '9') ||
        (str[y] > 'Z' && str[y] < 'a' && str[y] != '_') ||
        (str[y] > 'z' && str[y] != '~')) {
      str[y++] = '%';
      str[y++] = hexchars[(unsigned char)s[x] >> 4];
      str[y] = hexchars[(unsigned char)s[x] & 15];
    }
  }
  str[y] = '\0';
  len = y;
  return ((char *)str);
}

/**
 * Raw URL decoding of the `len` bytes at `s` into a fresh copy.
 *
 * "%XX" (two hex digits) becomes the byte it encodes; '+' is left untouched
 * and a '%' not followed by two hex digits inside the input is copied
 * verbatim.
 *
 * Returns the copy obtained from string_duplicate(), NUL-terminated, and
 * stores the decoded length in `len`.
 */
char *url_raw_decode(const char *s, int &len) {
  char *str = string_duplicate(s, len);
  char *dest = str;
  char *data = str;

  while (len--) {
    if (*data == '%' && len >= 2 &&
        isxdigit((unsigned char)*(data + 1)) &&
        isxdigit((unsigned char)*(data + 2))) {
      *dest = (char)php_htoi(data + 1);
      data += 2;
      len -= 2;
    } else {
      *dest = *data;
    }
    data++;
    dest++;
  }
  *dest = '\0';
  len = dest - str;
  return str;
}

///////////////////////////////////////////////////////////////////////////////
}