Arquivos
hhvm/hphp/util/zend/zend_html.cpp
T
steveo b4ca4bbbd0 change use of html_supported_charset to avoid duplicating work
change callers of determine_charset to check for nulls, instead of calling html_supported_charset (to pre-validate charset name) and then calling this (which has to run through a list of names anyway).
2013-03-27 16:10:33 -07:00

828 linhas
28 KiB
C++

/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
| Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 2.00 of the Zend license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.zend.com/license/2_00.txt. |
| If you did not receive a copy of the Zend license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@zend.com so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "util/zend/zend_html.h"
#include <util/lock.h>
#include <atomic>
#include <unicode/uchar.h>
#include <unicode/utf8.h>
namespace HPHP {
///////////////////////////////////////////////////////////////////////////////
// UTF-8 entity tables
using namespace entity_charset_enum;
/*
 * codepage 1252 is a Windows extension to iso-8859-1.
 *
 * Entity names for the 32 byte values that entity_map assigns to this table
 * (0x80 - 0x9f). Slot i names byte 0x80 + i; nullptr means the byte has no
 * named entity.
 */
static entity_table_t ent_cp_1252[] = {
"euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
"Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
"bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
"oelig", nullptr, nullptr, "Yuml"
};
/*
 * Entity names for values 0xa0 - 0xff (96 slots, slot i names 0xa0 + i).
 * entity_map reuses this table for several charsets besides iso-8859-1.
 */
static entity_table_t ent_iso_8859_1[] = {
"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
"sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
"macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
"para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
"frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
"Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
"uuml", "yacute", "thorn", "yuml"
};
/*
 * Entity names for iso-8859-15 bytes 0xa0 - 0xff (96 slots, slot i names
 * 0xa0 + i). It differs from ent_iso_8859_1 in a handful of slots (euro,
 * Scaron, scaron, OElig, oelig, Yuml); the two slots marked Zcaron / zcaron
 * have no named entity and are nullptr.
 */
static entity_table_t ent_iso_8859_15[] = {
"nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
"sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
"macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
"micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
"raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
"Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
"uuml", "yacute", "thorn", "yuml"
};
/*
 * Entity names for Unicode code points 338 - 402 (65 slots, slot i names
 * code point 338 + i). Only five code points in the range are named; every
 * other slot is nullptr padding so the table can be indexed directly.
 */
static entity_table_t ent_uni_338_402[] = {
/* 338 (0x0152) */
"OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 352 (0x0160) */
"Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 376 (0x0178) */
"Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 400 (0x0190) */
nullptr, nullptr, "fnof"
};
/*
 * Entity names for Unicode code points 710 - 732 (23 slots, slot i names
 * code point 710 + i). Only the first and last code points are named.
 */
static entity_table_t ent_uni_spacing[] = {
/* 710 */
"circ",
/* 711 - 730 */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 731 - 732 */
nullptr, "tilde"
};
/*
 * Entity names for Unicode code points 913 - 982 (70 slots, slot i names
 * code point 913 + i): upper-case letters, lower-case letters, then three
 * symbol variants. nullptr marks code points without a named entity.
 */
static entity_table_t ent_uni_greek[] = {
/* 913 */
"Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
"Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
/* 938 - 944 are not mapped */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 945 */
"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
"iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
"sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
/* 970 - 976 are not mapped */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 977 - 978 */
"thetasym", "upsih",
/* 979 - 981 are not mapped */
nullptr, nullptr, nullptr,
/* 982 */
"piv"
};
/*
 * Entity names for Unicode code points 8194 - 8260 (67 slots, slot i names
 * code point 8194 + i). nullptr marks code points without a named entity.
 */
static entity_table_t ent_uni_punct[] = {
/* 8194 */
"ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
"thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
/* 8216 */
"lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
"dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
/* 8242 */
"prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
"frasl"
};
/* Single-slot table: entity_map binds it to code point 8364 only. */
static entity_table_t ent_uni_euro[] = {
"euro"
};
/*
 * Entity names for Unicode code points 8465 - 8501 (37 slots, slot i names
 * code point 8465 + i). nullptr marks code points without a named entity.
 */
static entity_table_t ent_uni_8465_8501[] = {
/* 8465 */
"image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8472 */
"weierp", nullptr, nullptr, nullptr,
/* 8476 */
"real", nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8482 */
"trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8501 */
"alefsym",
};
/*
 * Entity names for Unicode code points 8592 - 9002 (411 slots, slot i names
 * code point 8592 + i). The section comments give the code point of the
 * first slot on the following line; nullptr marks code points without a
 * named entity.
 *
 * NOTE(review): "asymp" is listed twice in the 8768 section, at 8776 and
 * again at 8781. init_entity_table() walks the range in ascending order and
 * assigns by name, so the later slot (8781) is the one that ends up in the
 * name -> bytes maps used for decoding. Confirm which code point is intended
 * before changing either slot.
 */
static entity_table_t ent_uni_8592_9002[] = {
/* 8592 (0x2190) */
"larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8608 (0x21a0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8624 (0x21b0) */
nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8640 (0x21c0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8656 (0x21d0) */
"lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
/* 8672 (0x21e0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8704 (0x2200) */
"forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
"isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
/* 8720 (0x2210) */
"coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
"compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
/* 8736 (0x2220) */
"ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
"or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
/* 8752 (0x2230) */
nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
/* 8768 (0x2240) */
"wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
"asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
/* 8784 (0x2250) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8800 (0x2260) */
"ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
"lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
/* 8816 (0x2270) */
"nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
/* 8832 (0x2280) */
"npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8848 (0x2290) */
nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8864 (0x22a0) */
nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8880 (0x22b0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8896 (0x22c0) */
nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8912 (0x22d0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8928 (0x22e0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8944 (0x22f0) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8960 (0x2300) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
"lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
/* 8976 (0x2310) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
/* 8992 (0x2320) */
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, "lang", "rang"
};
/* Single-slot table: entity_map binds it to code point 9674 only. */
static entity_table_t ent_uni_9674[] = {
/* 9674 */
"loz"
};
/*
 * Entity names for Unicode code points 9824 - 9830 (7 slots, slot i names
 * code point 9824 + i). nullptr marks code points without a named entity.
 */
static entity_table_t ent_uni_9824_9830[] = {
/* 9824 */
"spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
};
/*
 * Master index tying each charset to the entity-name tables above.
 *
 * Each row is { charset, first value, last value, table }: the table holds
 * one slot per value in the inclusive range, so table[v - first] names the
 * value v. A charset may appear in several rows, one per mapped range.
 * The list ends with a row whose charset is cs_terminator;
 * init_entity_table() stops there, and html_get_entity_map() returns a
 * pointer to the first row.
 */
static const struct html_entity_map entity_map[] = {
{ cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
{ cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
{ cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_utf_8, 338, 402, ent_uni_338_402 },
{ cs_utf_8, 710, 732, ent_uni_spacing },
{ cs_utf_8, 913, 982, ent_uni_greek },
{ cs_utf_8, 8194, 8260, ent_uni_punct },
{ cs_utf_8, 8364, 8364, ent_uni_euro },
{ cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
{ cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
{ cs_utf_8, 9674, 9674, ent_uni_9674 },
{ cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
{ cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
{ cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
/* Missing support for these at the moment
{ cs_koi8r, 0xa3, 0xff, ent_koi8r },
{ cs_cp1251, 0x80, 0xff, ent_cp_1251 },
{ cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
{ cs_cp866, 0xc0, 0xff, ent_cp_866 },
{ cs_macroman, 0x0b, 0xff, ent_macroman },
*/
/* End-of-list sentinel. */
{ cs_terminator }
};
/*
 * Charset names accepted by determine_charset(), matched case-insensitively
 * against the whole hint. The list ends with a row whose codeset is nullptr.
 *
 * "EUC-JP" is listed next to "EUCJP": both name cs_eucjp, which entity_map
 * and decode_entity() already handle. It used to sit inside the
 * "missing support" comment below, so the hyphenated spelling was rejected
 * as an unknown charset.
 */
static const struct {
  const char *codeset;
  entity_charset charset;
} charset_map[] = {
  { "ISO-8859-1", cs_8859_1 },
  { "ISO8859-1", cs_8859_1 },
  { "ISO-8859-15", cs_8859_15 },
  { "ISO8859-15", cs_8859_15 },
  { "utf-8", cs_utf_8 },
  { "cp1252", cs_cp1252 },
  { "Windows-1252", cs_cp1252 },
  { "1252", cs_cp1252 },
  { "BIG5", cs_big5 },
  { "950", cs_big5 },
  { "GB2312", cs_gb2312 },
  { "936", cs_gb2312 },
  { "BIG5-HKSCS", cs_big5hkscs },
  { "Shift_JIS", cs_sjis },
  { "SJIS", cs_sjis },
  { "932", cs_sjis },
  { "EUCJP", cs_eucjp },
  { "EUC-JP", cs_eucjp },
  /* Missing support for these at the moment
  { "KOI8-R", cs_koi8r },
  { "koi8-ru", cs_koi8r },
  { "koi8r", cs_koi8r },
  { "cp1251", cs_cp1251 },
  { "Windows-1251", cs_cp1251 },
  { "win-1251", cs_cp1251 },
  { "iso8859-5", cs_8859_5 },
  { "iso-8859-5", cs_8859_5 },
  { "cp866", cs_cp866 },
  { "866", cs_cp866 },
  { "ibm866", cs_cp866 },
  { "MacRoman", cs_macroman },
  */
  /* End-of-list sentinel. */
  { nullptr }
};
///////////////////////////////////////////////////////////////////////////////
/**
 * Maps a charset name to its entity_charset value.
 *
 * @param charset_hint  charset name, or nullptr to request the default.
 * @return cs_utf_8 when no hint is given; the charset of the first
 *         charset_map row whose name equals the hint (ignoring case);
 *         cs_unknown when no row matches.
 */
entity_charset determine_charset(const char *charset_hint) {
  // No hint at all: default to utf-8.
  if (!charset_hint) {
    return cs_utf_8;
  }

  const size_t hint_len = strlen(charset_hint);

  // Walk the charset map until the nullptr sentinel row.
  for (int idx = 0; charset_map[idx].codeset != nullptr; ++idx) {
    const char *name = charset_map[idx].codeset;
    // Lengths must agree, otherwise a prefix of a longer name would match.
    if (strlen(name) != hint_len) {
      continue;
    }
    if (strncasecmp(charset_hint, name, hint_len) == 0) {
      return charset_map[idx].charset;
    }
  }

  return cs_unknown;
}
/**
 * Writes the code point k into buf as a NUL-terminated UTF-8 style byte
 * sequence of 1 to 6 bytes (the historical 5- and 6-byte forms are emitted
 * for values of 0x200000 and above).
 *
 * No validation is performed: any value below 0x80, negative values
 * included, takes the single-byte path and is stored truncated to one byte.
 *
 * @param buf  destination; must have room for the sequence plus the
 *             terminator (7 bytes in the worst case).
 * @param k    value to encode.
 * @return number of bytes written, not counting the terminator.
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  // Lead-byte marker bits, indexed by total sequence length.
  const unsigned char lead_marker[] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  };

  int nbytes;
  if (k < 0x80) {
    nbytes = 1;
  } else if (k < 0x800) {
    nbytes = 2;
  } else if (k < 0x10000) {
    nbytes = 3;
  } else if (k < 0x200000) {
    nbytes = 4;
  } else if (k < 0x4000000) {
    nbytes = 5;
  } else {
    nbytes = 6;
  }

  if (nbytes == 1) {
    buf[0] = k;
  } else {
    // The lead byte carries the top bits; each following byte carries the
    // next six bits, most significant group first.
    int shift = 6 * (nbytes - 1);
    buf[0] = lead_marker[nbytes] | (k >> shift);
    for (int pos = 1; pos < nbytes; pos++) {
      shift -= 6;
      buf[pos] = 0x80 | ((k >> shift) & 0x3f);
    }
  }

  buf[nbytes] = '\0';
  return nbytes;
}
// Entity name -> replacement bytes. Keys are C strings compared by content
// (eqstr), values own their bytes.
typedef hphp_hash_map
  <const char *, std::string, hphp_hash<const char *>, eqstr>
  HtmlEntityMap;

// Set once the two map arrays below have been populated. It is read without
// holding EntityMapMutex on the fast path, so it must be an atomic: a plain
// volatile bool gives no ordering guarantee between the writes that fill the
// maps and the write that publishes the flag.
static std::atomic<bool> EntityMapInited(false);

// Serialises the one-time call to init_entity_table().
static Mutex EntityMapMutex;

// One map per entity_charset value; slot cs_terminator holds the reduced
// set used when only the basic entities are to be decoded.
static HtmlEntityMap EntityMap[cs_end];
// Same layout, with the XHP-specific entities added.
static HtmlEntityMap XHPEntityMap[cs_end];
static void init_entity_table() {
for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
const html_entity_map &em = entity_map[i];
const entity_charset charset = entity_map[i].charset;
int index = 0;
for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
const char *entity = em.table[index];
if (entity == nullptr) {
continue;
}
unsigned char buf[10];
switch (charset) {
case cs_8859_1:
case cs_cp1252:
case cs_8859_15:
case cs_cp1251:
case cs_8859_5:
case cs_cp866:
case cs_koi8r:
buf[0] = ch;
buf[1] = '\0';
break;
case cs_utf_8:
utf32_to_utf8(buf, ch);
break;
default:
continue;
}
EntityMap[charset][entity] = (const char *)buf;
XHPEntityMap[charset][entity] = (const char *)buf;
}
EntityMap[charset]["quot"] = "\"";
EntityMap[charset]["lt"] = "<";
EntityMap[charset]["gt"] = ">";
EntityMap[charset]["amp"] = "&";
XHPEntityMap[charset]["quot"] = "\"";
XHPEntityMap[charset]["lt"] = "<";
XHPEntityMap[charset]["gt"] = ">";
XHPEntityMap[charset]["amp"] = "&";
// XHP-specific entities
XHPEntityMap[charset]["apos"] = "\'";
XHPEntityMap[charset]["cloud"] = "\u2601";
XHPEntityMap[charset]["umbrella"] = "\u2602";
XHPEntityMap[charset]["snowman"] = "\u2603";
XHPEntityMap[charset]["snowflake"] = "\u2745";
XHPEntityMap[charset]["comet"] = "\u2604";
XHPEntityMap[charset]["thunderstorm"] = "\u2608";
}
// the first element is an empty table
EntityMap[cs_terminator]["quot"] = "\"";
EntityMap[cs_terminator]["lt"] = "<";
EntityMap[cs_terminator]["gt"] = ">";
EntityMap[cs_terminator]["amp"] = "&";
// XHP-specific entities
XHPEntityMap[cs_terminator]["apos"] = "\'";
XHPEntityMap[cs_terminator]["cloud"] = "\u2601";
XHPEntityMap[cs_terminator]["umbrella"] = "\u2602";
XHPEntityMap[cs_terminator]["snowman"] = "\u2603";
XHPEntityMap[cs_terminator]["snowflake"] = "\u2745";
XHPEntityMap[cs_terminator]["comet"] = "\u2604";
XHPEntityMap[cs_terminator]["thunderstorm"] = "\u2608";
}
///////////////////////////////////////////////////////////////////////////////
/**
 * Escapes the HTML special characters in input[0 .. len).
 *
 * '<', '>' and '&' are always escaped. '"' becomes "&quot;" when
 * encode_double_quote is set and '\'' becomes "&#039;" when
 * encode_single_quote is set. When nbsp is set, a non-breaking space becomes
 * "&nbsp;": the byte pair C2 A0 if utf8 is set, the single byte A0 otherwise.
 * Every other byte is copied unchanged.
 *
 * @param input  bytes to encode; must not be null.
 * @param len    in: number of input bytes; out: number of bytes in the
 *               result, not counting the terminator.
 * @return a malloc()ed, NUL-terminated buffer the caller must free(), or
 *         nullptr if allocation fails or the result is longer than INT_MAX
 *         (len is left untouched in that case).
 */
char *string_html_encode(const char *input, int &len, bool encode_double_quote,
                         bool encode_single_quote, bool utf8, bool nbsp) {
  assert(input);
  /**
   * Though seems to be wasting memory a lot, we have to realize most of the
   * time this function is called with small strings, or fragments of HTMLs.
   * Allocating/deallocating anything less than 1K is trivial these days, and
   * we want avoid string copying as much as possible. Of course, the return
   * char * is really sent back at large, occupying unnecessary space for
   * potentially longer time than we need, we have to realize the two closest
   * solutions are not that much better, either:
   *
   * 1. pre-calculate size by iterating through the string once: too time
   *    consuming;
   * 2. take a guess and double buffer size when over: still wasting, and
   *    it may not save that much.
   */
  // The longest replacement is six bytes for one input byte.
  char *ret = (char *)malloc(len * 6uL + 1);
  if (!ret) {
    return nullptr;
  }
  char *q = ret;
  for (const char *p = input, *end = input + len; p < end; p++) {
    char c = *p;
    switch (c) {
      case '"':
        if (encode_double_quote) {
          *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
        } else {
          *q++ = c;
        }
        break;
      case '\'':
        if (encode_single_quote) {
          *q++ = '&'; *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9'; *q++ = ';';
        } else {
          *q++ = c;
        }
        break;
      case '<':
        *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
        break;
      case '>':
        *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
        break;
      case '&':
        *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
        break;
      case '\xc2':
        // Only look at the following byte when it is still part of the
        // input; otherwise a trailing C2 would read (and swallow) a byte
        // beyond the stated length.
        if (nbsp && utf8 && p + 1 < end && *(p + 1) == '\xa0') {
          *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
          p++;
        } else {
          *q++ = c;
        }
        break;
      case '\xa0':
        if (nbsp && !utf8) {
          *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
        } else {
          *q++ = c;
        }
        break;
      default:
        *q++ = c;
        break;
    }
  }
  // The new length is reported through an int; refuse results that do not fit.
  if (q - ret > INT_MAX) {
    free(ret);
    return nullptr;
  }
  *q = 0;
  len = q - ret;
  return ret;
}
/**
 * Flag-driven variant of the HTML encoder.
 *
 * Non-zero ASCII bytes are escaped only when their bit is set in asciiMap
 * (map[0] covers bytes 0-63, map[1] bytes 64-127): the five HTML specials
 * get their usual entities, anything else a three-digit decimal reference.
 * The remaining bytes (NUL and everything >= 0x80) are handled according to
 * flags:
 *  - with STRING_HTML_ENCODE_UTF8 the input is decoded code point by code
 *    point; U+00A0 becomes "&nbsp;" when STRING_HTML_ENCODE_NBSP is set,
 *    a result of zero or below from the decoder (malformed input, or a NUL
 *    byte) is dropped or, with STRING_HTML_ENCODE_UTF8IZE_REPLACE, replaced
 *    by U+FFFD, and other code points are written as "&#x...;" with
 *    STRING_HTML_ENCODE_HIGH or re-encoded as UTF-8 without it;
 *  - without it each byte stands alone: 0xA0 becomes "&nbsp;", other bytes
 *    become a decimal reference with STRING_HTML_ENCODE_HIGH or are copied.
 *
 * @param input     bytes to encode; must not be null.
 * @param len       in: number of input bytes; out: length of the result.
 * @param flags     STRING_HTML_ENCODE_* bits.
 * @param asciiMap  bitmap of ASCII bytes to escape; dereferenced
 *                  unconditionally, so it must not be null.
 * @return a malloc()ed, NUL-terminated buffer the caller must free(), or
 *         nullptr if allocation fails or the result is longer than INT_MAX.
 */
char *string_html_encode_extra(const char *input, int &len,
                               StringHtmlEncoding flags,
                               const AsciiMap *asciiMap) {
  assert(input);
  /**
   * Though seems to be wasting memory a lot, we have to realize most of the
   * time this function is called with small strings, or fragments of HTMLs.
   * Allocating/deallocating anything less than 1K is trivial these days, and
   * we want avoid string copying as much as possible. Of course, the return
   * char * is really sent back at large, occupying unnecessary space for
   * potentially longer time than we need, we have to realize the two closest
   * solutions are not that much better, either:
   *
   * 1. pre-calculate size by iterating through the string once: too time
   *    consuming;
   * 2. take a guess and double buffer size when over: still wasting, and
   *    it may not save that much.
   */
  // Worst case is eight output bytes ("&#xfffd;") for one input byte.
  char *ret = (char *)malloc(len * 8uL + 1);
  if (!ret) {
    return nullptr;
  }
  char *q = ret;
  const char *rep = "\ufffd";
  int32_t srcPosBytes;
  for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
    unsigned char c = input[srcPosBytes];
    if (c && c < 128) {
      srcPosBytes++; // Optimize US-ASCII case
      if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
        switch (c) {
          case '"':
            *q++ = '&'; *q++ = 'q'; *q++ = 'u';
            *q++ = 'o'; *q++ = 't'; *q++ = ';';
            break;
          case '\'':
            *q++ = '&'; *q++ = '#'; *q++ = '0';
            *q++ = '3'; *q++ = '9'; *q++ = ';';
            break;
          case '<':
            *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
            break;
          case '>':
            *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
            break;
          case '&':
            *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
            break;
          default:
            // Three-digit decimal reference; c is below 128 here.
            *q++ = '&'; *q++ = '#';
            *q++ = c >= 100 ? '1' : '0';
            *q++ = ((c / 10) % 10) + '0';
            *q++ = (c % 10) + '0';
            *q++ = ';';
            break;
        }
      } else {
        *q++ = c;
      }
    } else if (flags & STRING_HTML_ENCODE_UTF8) {
      UChar32 curCodePoint;
      U8_NEXT(input, srcPosBytes, len, curCodePoint);
      // U8_NEXT yields the decoded code point, so NBSP is U+00A0 here, not
      // the byte pair C2 A0 read as one number (0xC2A0 is a different,
      // unrelated code point).
      if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xA0) {
        *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
      } else if (curCodePoint <= 0) {
        // Malformed sequence or NUL: drop it unless replacement is requested.
        if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
          if (flags & STRING_HTML_ENCODE_HIGH) {
            *q++ = '&'; *q++ = '#'; *q++ = 'x';
            *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
            *q++ = ';';
          } else {
            const char *r = rep;
            while (*r) *q++ = *r++;
          }
        }
      } else if (flags & STRING_HTML_ENCODE_HIGH) {
        q += sprintf(q, "&#x%x;", curCodePoint);
      } else {
        int32_t pos = 0;
        U8_APPEND_UNSAFE(q, pos, curCodePoint);
        q += pos;
      }
    } else {
      srcPosBytes++; // single-byte charset: one byte per character
      if (c == 0xa0) {
        *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
      } else if (flags & STRING_HTML_ENCODE_HIGH) {
        // c is either 0 or >= 128 here. Derive the hundreds digit from the
        // value so that a NUL byte is written as "&#000;" rather than
        // "&#100;" (which is the letter 'd').
        *q++ = '&'; *q++ = '#';
        *q++ = (c / 100) + '0';
        *q++ = ((c / 10) % 10) + '0';
        *q++ = (c % 10) + '0';
        *q++ = ';';
      } else {
        *q++ = c;
      }
    }
  }
  // The new length is reported through an int; refuse results that do not fit.
  if (q - ret > INT_MAX) {
    free(ret);
    return nullptr;
  }
  *q = 0;
  len = q - ret;
  return ret;
}
/**
 * Decodes one entity in place.
 *
 * On entry `entity` holds the NUL-terminated text found between '&' and ';'
 * (for example "amp" or "#x41"). On success it is overwritten with the
 * NUL-terminated replacement bytes and *len receives their count.
 *
 * Numeric references ("#..." / "#x...") are converted according to charset;
 * named ones are looked up in EntityMap / XHPEntityMap, using the charset's
 * table when `all` is set and the cs_terminator table otherwise.
 *
 * @param entity  in/out buffer; the caller in this file supplies at least
 *                16 bytes (or the entity length plus one when that is
 *                larger), which bounds every replacement written here.
 * @param len     out: length of the replacement, set only on success.
 * @param decode_double_quote  when false, names starting with "quot"
 *                (any case) are left undecoded.
 * @param decode_single_quote  when true, the numeric reference 39 is decoded
 *                for every charset.
 * @param charset target charset.
 * @param all     use the full table of the charset for named entities.
 * @param xhp     use the XHP tables for named entities.
 * @return true if the entity was decoded, false to leave the text as is.
 */
inline static bool decode_entity(char *entity, int *len,
                                 bool decode_double_quote,
                                 bool decode_single_quote,
                                 entity_charset charset, bool all,
                                 bool xhp = false) {
  assert(entity && *entity);
  if (entity[0] == '#') {
    long parsed;
    if (entity[1] == 'x' || entity[1] == 'X') {
      parsed = strtol(entity + 2, nullptr, 16);
    } else {
      parsed = strtol(entity + 1, nullptr, 10);
    }
    // Reject values that do not fit a non-negative int instead of letting
    // them wrap: narrowing the long silently turned e.g. "#4294967335" into
    // 39 (a single quote), and negative values into arbitrary bytes.
    if (parsed < 0 || parsed > INT_MAX) {
      return false;
    }
    int code = (int)parsed;

    // since we don't support multibyte chars other than utf-8
    int l = 1;

    if (code == 39 && decode_single_quote) {
      entity[0] = code;
      entity[1] = '\0';
      *len = l;
      return true;
    }

    switch (charset) {
      case cs_utf_8:
      {
        unsigned char buf[10];
        int size = utf32_to_utf8(buf, code);
        memcpy(entity, buf, size + 1);  // + 1 copies the terminator too
        l = size;
        break;
      }
      case cs_8859_1:
      case cs_8859_5:
      case cs_8859_15:
        if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
          return false;
        } else {
          // Reaching this point with 39 means single quotes are not to be
          // decoded (the enabled case returned above).
          if (code == 39) {
            return false;
          }
          entity[0] = code;
          entity[1] = '\0';
        }
        break;
      case cs_cp1252:
      case cs_cp1251:
      case cs_cp866:
        if (code > 0xff) {
          return false;
        }
        entity[0] = code;
        entity[1] = '\0';
        break;
      case cs_big5:
      case cs_big5hkscs:
      case cs_sjis:
      case cs_eucjp:
        if (code >= 0x80) {
          return false;
        }
        entity[0] = code;
        entity[1] = '\0';
        break;
      case cs_gb2312:
        if (code >= 0x81) {
          return false;
        }
        entity[0] = code;
        entity[1] = '\0';
        break;
      default:
        return false;
        break;
    }
    *len = l;
    return true;
  } else {
    HtmlEntityMap *entityMap;

    if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
      return false;
    }

    if (all) {
      entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
    } else {
      entityMap = xhp ? &XHPEntityMap[cs_terminator]
                      : &EntityMap[cs_terminator];
    }
    HtmlEntityMap::const_iterator iter = entityMap->find(entity);
    if (iter != entityMap->end()) {
      memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
      *len = iter->second.length();
      return true;
    }
  }

  return false;
}
/**
 * Replaces HTML entities in input with the characters they stand for.
 *
 * Text between '&' and the next ';' is handed to decode_entity(); when it
 * is not a known entity the '&' is copied through and scanning resumes at
 * the character after it.
 *
 * @param input  text to decode; must not be null. The scan stops at the
 *               first NUL byte at or beyond offset len, so the buffer is
 *               expected to be NUL-terminated there.
 * @param len    in: length of input; out: length of the result (unchanged
 *               when nullptr is returned).
 * @param decode_double_quote  decode "&quot;".
 * @param decode_single_quote  decode "&#39;" / "&#039;".
 * @param charset_hint  target charset name, nullptr for utf-8.
 * @param all    decode every named entity of the charset, not just the
 *               basic ones.
 * @param xhp    also accept the XHP-specific entities.
 * @return a malloc()ed, NUL-terminated buffer the caller must free(), or
 *         nullptr when the charset is not recognised or memory cannot be
 *         allocated.
 */
char *string_html_decode(const char *input, int &len,
                         bool decode_double_quote, bool decode_single_quote,
                         const char *charset_hint, bool all,
                         bool xhp /* = false */) {
  assert(input);

  // Build the entity maps on first use; the flag is re-checked under the
  // mutex so only one thread runs the initialisation.
  if (!EntityMapInited) {
    Lock lock(EntityMapMutex);
    if (!EntityMapInited) {
      init_entity_table();
      EntityMapInited = true;
    }
  }

  entity_charset charset = determine_charset(charset_hint);
  if (charset == cs_unknown) {
    return nullptr;
  }

  // Sized like the input: this relies on a replacement never being longer
  // than the entity text it replaces.
  char *ret = (char *)malloc(len + 1);
  if (!ret) {
    return nullptr;
  }
  char *q = ret;
  for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
    char ch = *p;
    if (ch != '&') {
      *q++ = ch;
      continue;
    }
    p++;

    bool found = false;
    for (const char *t = p; *t; t++) {
      if (*t == ';') {
        int l = t - p;
        if (l > 0) {
          char sbuf[16] = {0};
          char *buf;
          if (l > 10) {
            buf = (char *)malloc(l + 1);
            if (!buf) {
              free(ret);
              return nullptr;
            }
          } else {
            buf = sbuf;
          }
          memcpy(buf, p, l);
          buf[l] = '\0';
          if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
                            charset, all, xhp)) {
            memcpy(q, buf, l);
            found = true;
            p = t;  // continue after the ';'
            q += l;
          }
          if (buf != sbuf) {
            free(buf);
          }
        }
        break;
      }
    }
    if (!found) {
      p--;
      *q++ = '&'; // not an entity
    }
  }
  *q = '\0';
  len = q - ret;
  return ret;
}
/**
 * Returns the static entity_map array, first making sure that the decoding
 * tables have been built (init_entity_table() is run at most once, under
 * EntityMapMutex).
 */
const html_entity_map* html_get_entity_map() {
  // Already initialised: nothing to do.
  if (EntityMapInited) {
    return entity_map;
  }

  {
    Lock lock(EntityMapMutex);
    // Another thread may have finished the job while we waited for the lock.
    if (!EntityMapInited) {
      init_entity_table();
      EntityMapInited = true;
    }
  }

  return entity_map;
}
///////////////////////////////////////////////////////////////////////////////
}