b4ca4bbbd0
change callers of determine_charset to check for nulls, instead of calling html_supported_charset (to pre-validate charset name) and then calling this (which has to run through a list of names anyway).
828 linhas
28 KiB
C++
828 linhas
28 KiB
C++
/*
|
|
+----------------------------------------------------------------------+
|
|
| HipHop for PHP |
|
|
+----------------------------------------------------------------------+
|
|
| Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
|
|
| Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
|
|
+----------------------------------------------------------------------+
|
|
| This source file is subject to version 2.00 of the Zend license, |
|
|
| that is bundled with this package in the file LICENSE, and is |
|
|
| available through the world-wide-web at the following url: |
|
|
| http://www.zend.com/license/2_00.txt. |
|
|
| If you did not receive a copy of the Zend license and are unable to |
|
|
| obtain it through the world-wide-web, please send a note to |
|
|
| license@zend.com so we can mail you a copy immediately. |
|
|
+----------------------------------------------------------------------+
|
|
*/
|
|
|
|
#include "util/zend/zend_html.h"
|
|
#include <util/lock.h>
|
|
#include <unicode/uchar.h>
|
|
#include <unicode/utf8.h>
|
|
|
|
namespace HPHP {
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// UTF-8 entity tables
|
|
|
|
using namespace entity_charset_enum;
|
|
|
|
/* codepage 1252 is a Windows extension to iso-8859-1. */
|
|
static entity_table_t ent_cp_1252[] = {
|
|
"euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
|
|
"Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
|
|
nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
|
|
"bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
|
|
"oelig", nullptr, nullptr, "Yuml"
|
|
};
|
|
|
|
/*
 * Entity names for the 96 consecutive characters 0xa0..0xff of ISO-8859-1,
 * eight per row.
 */
static entity_table_t ent_iso_8859_1[] = {
  /* 0xa0 */ "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
  /* 0xa8 */ "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
  /* 0xb0 */ "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
  /* 0xb8 */ "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
  /* 0xc0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
  /* 0xc8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
  /* 0xd0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
  /* 0xd8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
  /* 0xe0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
  /* 0xe8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
  /* 0xf0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
  /* 0xf8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
};
|
|
|
|
/*
 * Entity names for the 96 consecutive characters 0xa0..0xff of ISO-8859-15,
 * eight per row.  nullptr marks a character without a named entity
 * (Zcaron at 0xb4 and zcaron at 0xb8).
 */
static entity_table_t ent_iso_8859_15[] = {
  /* 0xa0 */ "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", "sect",
  /* 0xa8 */ "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
  /* 0xb0 */ "deg", "plusmn", "sup2", "sup3", nullptr /* Zcaron */, "micro", "para", "middot",
  /* 0xb8 */ nullptr /* zcaron */, "sup1", "ordm", "raquo", "OElig", "oelig", "Yuml", "iquest",
  /* 0xc0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
  /* 0xc8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
  /* 0xd0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
  /* 0xd8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
  /* 0xe0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
  /* 0xe8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
  /* 0xf0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
  /* 0xf8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
};
|
|
|
|
/*
 * Entity names for the consecutive code points 338..402 (65 entries).
 * Only five of them are named; nullptr fills every gap so that the array
 * index stays equal to (code point - 338).
 */
static entity_table_t ent_uni_338_402[] = {
  /* 338 (0x0152) */ "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
  /* 344 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 352 (0x0160) */ "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 360 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 368 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 376 (0x0178) */ "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 384 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 392 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 400 (0x0190) */ nullptr, nullptr, "fnof"
};
|
|
|
|
/*
 * Entity names for the consecutive code points 710..732 (23 entries);
 * only the first ("circ") and the last ("tilde") are named.
 */
static entity_table_t ent_uni_spacing[] = {
  /* 710 */ "circ",
  /* 711 - 720 */
  nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 721 - 730 */
  nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 731 */ nullptr,
  /* 732 */ "tilde"
};
|
|
|
|
/*
 * Entity names for the consecutive code points 913..982 (70 entries):
 * the Greek capitals, the Greek small letters and a few symbol variants.
 * nullptr marks a code point without a named entity.
 */
static entity_table_t ent_uni_greek[] = {
  /* 913 - 920 */
  "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
  /* 921 - 929 */
  "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
  /* 930 - 937 */
  nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
  /* 938 - 944 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 945 - 952 */
  "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
  /* 953 - 961 */
  "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
  /* 962 - 969 */
  "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
  /* 970 - 976 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 977 - 978 */
  "thetasym", "upsih",
  /* 979 - 981 are not mapped */
  nullptr, nullptr, nullptr,
  /* 982 */
  "piv"
};
|
|
|
|
/*
 * Entity names for the consecutive code points 8194..8260 (67 entries),
 * eight per row.  nullptr marks a code point without a named entity.
 */
static entity_table_t ent_uni_punct[] = {
  /* 8194 */ "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr, "thinsp",
  /* 8202 */ nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm", nullptr, nullptr,
  /* 8210 */ nullptr, "ndash", "mdash", nullptr, nullptr, nullptr, "lsquo", "rsquo",
  /* 8218 */ "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr, "dagger", "Dagger",
  /* 8226 */ "bull", nullptr, nullptr, nullptr, "hellip", nullptr, nullptr, nullptr,
  /* 8234 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
  /* 8242 */ "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo",
  /* 8250 */ "rsaquo", nullptr, nullptr, nullptr, "oline", nullptr, nullptr, nullptr,
  /* 8258 */ nullptr, nullptr, "frasl"
};
|
|
|
|
/* Single-entry table: the euro sign. */
static entity_table_t ent_uni_euro[] = { "euro" };
|
|
|
|
/*
 * Entity names for the consecutive code points 8465..8501 (37 entries).
 * nullptr marks a code point without a named entity.
 */
static entity_table_t ent_uni_8465_8501[] = {
  /* 8465 - 8471 */
  "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8472 - 8475 */
  "weierp", nullptr, nullptr, nullptr,
  /* 8476 - 8481 */
  "real", nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8482 - 8490 */
  "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8491 - 8500 */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8501 */
  "alefsym"
};
|
|
|
|
/*
 * Entity names for the consecutive code points 8592..9002 (411 entries),
 * eight per row; the comment in front of each row gives the code point of
 * its first entry.  nullptr marks a code point without a named entity.
 *
 * Every name must appear at most once: the decoder's lookup table is keyed
 * by name, so a repeated name would silently redirect the entity to the
 * later code point.
 */
static entity_table_t ent_uni_8592_9002[] = {
  /* 8592 (0x2190) */
  "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8608 (0x21a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8624 (0x21b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8640 (0x21c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8656 (0x21d0) */
  "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
  nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
  /* 8672 (0x21e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8688 (0x21f0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8704 (0x2200) */
  "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
  "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
  /* 8720 (0x2210) */
  "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
  "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
  /* 8736 (0x2220) */
  "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
  "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
  /* 8752 (0x2230) */
  nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
  /* 8768 (0x2240) */
  "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
  /* 8776: "asymp" belongs to U+2248 (first entry of this row).  The entry
   * for U+224D (8781) used to repeat "asymp", which made the decoder map
   * &asymp; to U+224D; it is left unnamed instead. */
  "asymp", "nap", "ape", nullptr, "bcong", nullptr, "bump", "bumpe",
  /* 8784 (0x2250) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8800 (0x2260) */
  "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
  "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
  /* 8816 (0x2270) */
  "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
  nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
  /* 8832 (0x2280) */
  "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8848 (0x2290) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8864 (0x22a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8880 (0x22b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8896 (0x22c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8912 (0x22d0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8928 (0x22e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8944 (0x22f0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8960 (0x2300) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
  /* 8976 (0x2310) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8992 (0x2320) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, "lang", "rang"
};
|
|
|
|
/* Single-entry table: code point 9674 ("loz"). */
static entity_table_t ent_uni_9674[] = { /* 9674 */ "loz" };
|
|
|
|
/*
 * Entity names for the consecutive code points 9824..9830 (card suits);
 * nullptr marks a code point without a named entity.
 */
static entity_table_t ent_uni_9824_9830[] = {
  /* 9824 */ "spades",
  /* 9825 */ nullptr,
  /* 9826 */ nullptr,
  /* 9827 */ "clubs",
  /* 9828 */ nullptr,
  /* 9829 */ "hearts",
  /* 9830 */ "diams"
};
|
|
|
|
static const struct html_entity_map entity_map[] = {
|
|
{ cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
|
|
{ cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
|
|
{ cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_utf_8, 338, 402, ent_uni_338_402 },
|
|
{ cs_utf_8, 710, 732, ent_uni_spacing },
|
|
{ cs_utf_8, 913, 982, ent_uni_greek },
|
|
{ cs_utf_8, 8194, 8260, ent_uni_punct },
|
|
{ cs_utf_8, 8364, 8364, ent_uni_euro },
|
|
{ cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
|
|
{ cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
|
|
{ cs_utf_8, 9674, 9674, ent_uni_9674 },
|
|
{ cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
|
|
{ cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
|
|
{ cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
|
|
/* Missing support for these at the moment
|
|
{ cs_koi8r, 0xa3, 0xff, ent_koi8r },
|
|
{ cs_cp1251, 0x80, 0xff, ent_cp_1251 },
|
|
{ cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
|
|
{ cs_cp866, 0xc0, 0xff, ent_cp_866 },
|
|
{ cs_macroman, 0x0b, 0xff, ent_macroman },
|
|
*/
|
|
{ cs_terminator }
|
|
};
|
|
|
|
static const struct {
|
|
const char *codeset;
|
|
entity_charset charset;
|
|
} charset_map[] = {
|
|
{ "ISO-8859-1", cs_8859_1 },
|
|
{ "ISO8859-1", cs_8859_1 },
|
|
{ "ISO-8859-15", cs_8859_15 },
|
|
{ "ISO8859-15", cs_8859_15 },
|
|
{ "utf-8", cs_utf_8 },
|
|
{ "cp1252", cs_cp1252 },
|
|
{ "Windows-1252", cs_cp1252 },
|
|
{ "1252", cs_cp1252 },
|
|
{ "BIG5", cs_big5 },
|
|
{ "950", cs_big5 },
|
|
{ "GB2312", cs_gb2312 },
|
|
{ "936", cs_gb2312 },
|
|
{ "BIG5-HKSCS", cs_big5hkscs },
|
|
{ "Shift_JIS", cs_sjis },
|
|
{ "SJIS", cs_sjis },
|
|
{ "932", cs_sjis },
|
|
{ "EUCJP", cs_eucjp },
|
|
/* Missing support for these at the moment
|
|
{ "EUC-JP", cs_eucjp },
|
|
{ "KOI8-R", cs_koi8r },
|
|
{ "koi8-ru", cs_koi8r },
|
|
{ "koi8r", cs_koi8r },
|
|
{ "cp1251", cs_cp1251 },
|
|
{ "Windows-1251", cs_cp1251 },
|
|
{ "win-1251", cs_cp1251 },
|
|
{ "iso8859-5", cs_8859_5 },
|
|
{ "iso-8859-5", cs_8859_5 },
|
|
{ "cp866", cs_cp866 },
|
|
{ "866", cs_cp866 },
|
|
{ "ibm866", cs_cp866 },
|
|
{ "MacRoman", cs_macroman },
|
|
*/
|
|
{ nullptr }
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
 * Maps a charset name to its entity_charset value.
 *
 * @param charset_hint  charset name, or nullptr to request the default
 * @return cs_utf_8 when charset_hint is nullptr; otherwise the charset of
 *         the first charset_map entry whose name equals the hint ignoring
 *         case; cs_unknown when no entry matches
 */
entity_charset determine_charset(const char *charset_hint) {
  if (!charset_hint) {
    // no hint: default to utf-8
    return cs_utf_8;
  }

  // Whole-name, case-insensitive comparison against every known codeset.
  for (int idx = 0; charset_map[idx].codeset != nullptr; ++idx) {
    if (strcasecmp(charset_hint, charset_map[idx].codeset) == 0) {
      return charset_map[idx].charset;
    }
  }

  return cs_unknown;
}
|
|
|
|
/**
 * Writes the value k into buf as a multi-byte sequence using the 1- to
 * 6-byte UTF-8 layout, followed by a terminating NUL.
 *
 * No validation is performed: values below 0x80 (including negative ones)
 * are stored as a single byte truncated to 8 bits, and values of 0x200000
 * and above produce 5- and 6-byte sequences.
 *
 * @param buf  destination; must have room for 7 bytes
 * @param k    value to encode
 * @return number of bytes written, not counting the terminating NUL
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  int nbytes;          // length of the sequence
  unsigned char lead;  // marker bits of the first byte

  if (k < 0x80) {
    nbytes = 1;
    lead = 0x00;
  } else if (k < 0x800) {
    nbytes = 2;
    lead = 0xc0;
  } else if (k < 0x10000) {
    nbytes = 3;
    lead = 0xe0;
  } else if (k < 0x200000) {
    nbytes = 4;
    lead = 0xf0;
  } else if (k < 0x4000000) {
    nbytes = 5;
    lead = 0xf8;
  } else {
    nbytes = 6;
    lead = 0xfc;
  }

  // Fill the continuation bytes from the last one backwards, six payload
  // bits each; whatever is left over belongs to the lead byte.
  int rest = k;
  for (int pos = nbytes - 1; pos > 0; --pos) {
    buf[pos] = 0x80 | (rest & 0x3f);
    rest >>= 6;
  }
  buf[0] = lead | rest;
  buf[nbytes] = '\0';

  return nbytes;
}
|
|
|
|
typedef hphp_hash_map
|
|
<const char *, std::string, hphp_hash<const char *>, eqstr>
|
|
HtmlEntityMap;
|
|
|
|
static volatile bool EntityMapInited = false;
|
|
static Mutex EntityMapMutex;
|
|
static HtmlEntityMap EntityMap[cs_end];
|
|
static HtmlEntityMap XHPEntityMap[cs_end];
|
|
|
|
static void init_entity_table() {
|
|
for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
|
|
const html_entity_map &em = entity_map[i];
|
|
const entity_charset charset = entity_map[i].charset;
|
|
|
|
int index = 0;
|
|
for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
|
|
const char *entity = em.table[index];
|
|
if (entity == nullptr) {
|
|
continue;
|
|
}
|
|
unsigned char buf[10];
|
|
switch (charset) {
|
|
case cs_8859_1:
|
|
case cs_cp1252:
|
|
case cs_8859_15:
|
|
case cs_cp1251:
|
|
case cs_8859_5:
|
|
case cs_cp866:
|
|
case cs_koi8r:
|
|
buf[0] = ch;
|
|
buf[1] = '\0';
|
|
break;
|
|
|
|
case cs_utf_8:
|
|
utf32_to_utf8(buf, ch);
|
|
break;
|
|
|
|
default:
|
|
continue;
|
|
}
|
|
EntityMap[charset][entity] = (const char *)buf;
|
|
XHPEntityMap[charset][entity] = (const char *)buf;
|
|
}
|
|
|
|
EntityMap[charset]["quot"] = "\"";
|
|
EntityMap[charset]["lt"] = "<";
|
|
EntityMap[charset]["gt"] = ">";
|
|
EntityMap[charset]["amp"] = "&";
|
|
|
|
XHPEntityMap[charset]["quot"] = "\"";
|
|
XHPEntityMap[charset]["lt"] = "<";
|
|
XHPEntityMap[charset]["gt"] = ">";
|
|
XHPEntityMap[charset]["amp"] = "&";
|
|
// XHP-specific entities
|
|
XHPEntityMap[charset]["apos"] = "\'";
|
|
XHPEntityMap[charset]["cloud"] = "\u2601";
|
|
XHPEntityMap[charset]["umbrella"] = "\u2602";
|
|
XHPEntityMap[charset]["snowman"] = "\u2603";
|
|
XHPEntityMap[charset]["snowflake"] = "\u2745";
|
|
XHPEntityMap[charset]["comet"] = "\u2604";
|
|
XHPEntityMap[charset]["thunderstorm"] = "\u2608";
|
|
}
|
|
|
|
// the first element is an empty table
|
|
EntityMap[cs_terminator]["quot"] = "\"";
|
|
EntityMap[cs_terminator]["lt"] = "<";
|
|
EntityMap[cs_terminator]["gt"] = ">";
|
|
EntityMap[cs_terminator]["amp"] = "&";
|
|
// XHP-specific entities
|
|
XHPEntityMap[cs_terminator]["apos"] = "\'";
|
|
XHPEntityMap[cs_terminator]["cloud"] = "\u2601";
|
|
XHPEntityMap[cs_terminator]["umbrella"] = "\u2602";
|
|
XHPEntityMap[cs_terminator]["snowman"] = "\u2603";
|
|
XHPEntityMap[cs_terminator]["snowflake"] = "\u2745";
|
|
XHPEntityMap[cs_terminator]["comet"] = "\u2604";
|
|
XHPEntityMap[cs_terminator]["thunderstorm"] = "\u2608";
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
 * Escapes the HTML special characters of input[0..len).
 *
 * '<', '>' and '&' are always replaced by their entities.  '"' becomes
 * &quot; when encode_double_quote is set and '\'' becomes &#039; when
 * encode_single_quote is set.  When nbsp is set, a non-breaking space is
 * replaced by &nbsp;: the byte pair C2 A0 if utf8 is set, the single byte
 * A0 otherwise.  Every other byte is copied unchanged.
 *
 * @param input  bytes to encode; must not be null
 * @param len    in: number of input bytes; out: length of the result
 * @return malloc'ed, NUL-terminated result that the caller must free();
 *         nullptr if allocation fails or the result is longer than INT_MAX
 *         (len is left untouched in that case)
 */
char *string_html_encode(const char *input, int &len, bool encode_double_quote,
                         bool encode_single_quote, bool utf8, bool nbsp) {
  assert(input);
  /**
   * The buffer is sized for the worst case up front (the longest
   * replacement is 6 bytes for a single input byte).  This looks wasteful,
   * but the function is mostly called with small strings or HTML
   * fragments, and the alternatives are not clearly better:
   *
   * 1. pre-calculating the size needs a second pass over the input;
   * 2. guessing and growing the buffer still wastes memory and adds
   *    copying.
   */
  char *ret = (char *)malloc(len * 6uL + 1);
  if (!ret) {
    return nullptr;
  }
  char *q = ret;
  for (const char *p = input, *end = input + len; p < end; p++) {
    char c = *p;
    switch (c) {
    case '"':
      if (encode_double_quote) {
        *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
      } else {
        *q++ = c;
      }
      break;
    case '\'':
      if (encode_single_quote) {
        *q++ = '&'; *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9'; *q++ = ';';
      } else {
        *q++ = c;
      }
      break;
    case '<':
      *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
      break;
    case '>':
      *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
      break;
    case '&':
      *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
      break;
    case '\xc2':
      // Only look at the following byte if it is still part of the input;
      // otherwise a trailing 0xC2 would read, and possibly consume, a byte
      // beyond len.
      if (nbsp && utf8 && p + 1 < end && *(p + 1) == '\xa0') {
        *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
        p++;  // the 0xA0 byte has been consumed as well
      } else {
        *q++ = c;
      }
      break;
    case '\xa0':
      if (nbsp && !utf8) {
        *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
      } else {
        *q++ = c;
      }
      break;
    default:
      *q++ = c;
      break;
    }
  }
  // len is an int, so a longer result cannot be reported to the caller
  if (q - ret > INT_MAX) {
    free(ret);
    return nullptr;
  }
  *q = 0;
  len = q - ret;
  return ret;
}
|
|
|
|
char *string_html_encode_extra(const char *input, int &len,
|
|
StringHtmlEncoding flags,
|
|
const AsciiMap *asciiMap) {
|
|
assert(input);
|
|
/**
|
|
* Though seems to be wasting memory a lot, we have to realize most of the
|
|
* time this function is called with small strings, or fragments of HTMLs.
|
|
* Allocating/deallocating anything less than 1K is trivial these days, and
|
|
* we want avoid string copying as much as possible. Of course, the return
|
|
* char * is really sent back at large, occupying unnessary space for
|
|
* potentially longer time than we need, we have to realize the two closest
|
|
* solutions are not that much better, either:
|
|
*
|
|
* 1. pre-calculate size by iterating through the string once: too time
|
|
* consuming;
|
|
* 2. take a guess and double buffer size when over: still wasting, and
|
|
* it may not save that much.
|
|
*/
|
|
char *ret = (char *)malloc(len * 8uL + 1);
|
|
if (!ret) {
|
|
return nullptr;
|
|
}
|
|
char *q = ret;
|
|
const char *rep = "\ufffd";
|
|
int32_t srcPosBytes;
|
|
for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
|
|
unsigned char c = input[srcPosBytes];
|
|
if (c && c < 128) {
|
|
srcPosBytes++; // Optimize US-ASCII case
|
|
if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
|
|
switch (c) {
|
|
case '"':
|
|
*q++ = '&'; *q++ = 'q'; *q++ = 'u';
|
|
*q++ = 'o'; *q++ = 't'; *q++ = ';';
|
|
break;
|
|
case '\'':
|
|
*q++ = '&'; *q++ = '#'; *q++ = '0';
|
|
*q++ = '3'; *q++ = '9'; *q++ = ';';
|
|
break;
|
|
case '<':
|
|
*q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
|
|
break;
|
|
case '>':
|
|
*q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
|
|
break;
|
|
case '&':
|
|
*q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
|
|
break;
|
|
default:
|
|
*q++ = '&'; *q++ = '#';
|
|
*q++ = c >= 100 ? '1' : '0';
|
|
*q++ = ((c / 10) % 10) + '0';
|
|
*q++ = (c % 10) + '0';
|
|
*q++ = ';';
|
|
break;
|
|
}
|
|
} else {
|
|
*q++ = c;
|
|
}
|
|
} else if (flags & STRING_HTML_ENCODE_UTF8) {
|
|
UChar32 curCodePoint;
|
|
U8_NEXT(input, srcPosBytes, len, curCodePoint);
|
|
if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
|
|
*q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
|
|
} else if (curCodePoint <= 0) {
|
|
if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
|
|
if (flags & STRING_HTML_ENCODE_HIGH) {
|
|
*q++ = '&'; *q++ = '#'; *q++ = 'x';
|
|
*q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
|
|
*q++ = ';';
|
|
} else {
|
|
const char *r = rep;
|
|
while (*r) *q++ = *r++;
|
|
}
|
|
}
|
|
} else if (flags & STRING_HTML_ENCODE_HIGH) {
|
|
q += sprintf(q, "&#x%x;", curCodePoint);
|
|
} else {
|
|
int32_t pos = 0;
|
|
U8_APPEND_UNSAFE(q, pos, curCodePoint);
|
|
q += pos;
|
|
}
|
|
} else {
|
|
srcPosBytes++; // Optimize US-ASCII case
|
|
if (c == 0xa0) {
|
|
*q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
|
|
} else if (flags & STRING_HTML_ENCODE_HIGH) {
|
|
*q++ = '&'; *q++ = '#';
|
|
*q++ = c >= 200 ? '2' : '1';
|
|
*q++ = ((c / 10) % 10) + '0';
|
|
*q++ = (c % 10) + '0';
|
|
*q++ = ';';
|
|
} else {
|
|
*q++ = c;
|
|
}
|
|
}
|
|
}
|
|
if (q - ret > INT_MAX) {
|
|
free(ret);
|
|
return nullptr;
|
|
}
|
|
*q = 0;
|
|
len = q - ret;
|
|
return ret;
|
|
}
|
|
|
|
inline static bool decode_entity(char *entity, int *len,
|
|
bool decode_double_quote,
|
|
bool decode_single_quote,
|
|
entity_charset charset, bool all,
|
|
bool xhp = false) {
|
|
// entity is 16 bytes, allocated statically below
|
|
// default in PHP
|
|
assert(entity && *entity);
|
|
if (entity[0] == '#') {
|
|
int code;
|
|
if (entity[1] == 'x' || entity[1] == 'X') {
|
|
code = strtol(entity + 2, nullptr, 16);
|
|
} else {
|
|
code = strtol(entity + 1, nullptr, 10);
|
|
}
|
|
|
|
// since we don't support multibyte chars other than utf-8
|
|
int l = 1;
|
|
|
|
if (code == 39 && decode_single_quote) {
|
|
entity[0] = code;
|
|
entity[1] = '\0';
|
|
*len = l;
|
|
return true;
|
|
}
|
|
|
|
switch (charset) {
|
|
case cs_utf_8:
|
|
{
|
|
unsigned char buf[10];
|
|
int size = utf32_to_utf8(buf, code);
|
|
memcpy(entity, buf, size + 1);
|
|
l = size;
|
|
break;
|
|
}
|
|
|
|
case cs_8859_1:
|
|
case cs_8859_5:
|
|
case cs_8859_15:
|
|
if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
|
|
return false;
|
|
} else {
|
|
if (code == 39) {
|
|
return false;
|
|
}
|
|
entity[0] = code;
|
|
entity[1] = '\0';
|
|
}
|
|
break;
|
|
|
|
case cs_cp1252:
|
|
case cs_cp1251:
|
|
case cs_cp866:
|
|
if (code > 0xff) {
|
|
return false;
|
|
}
|
|
entity[0] = code;
|
|
entity[1] = '\0';
|
|
break;
|
|
|
|
case cs_big5:
|
|
case cs_big5hkscs:
|
|
case cs_sjis:
|
|
case cs_eucjp:
|
|
if (code >= 0x80) {
|
|
return false;
|
|
}
|
|
entity[0] = code;
|
|
entity[1] = '\0';
|
|
break;
|
|
|
|
case cs_gb2312:
|
|
if (code >= 0x81) {
|
|
return false;
|
|
}
|
|
entity[0] = code;
|
|
entity[1] = '\0';
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
*len = l;
|
|
return true;
|
|
} else {
|
|
HtmlEntityMap *entityMap;
|
|
|
|
if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
|
|
return false;
|
|
}
|
|
|
|
if (all) {
|
|
entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
|
|
} else {
|
|
entityMap = xhp ? &XHPEntityMap[cs_terminator]
|
|
: &EntityMap[cs_terminator];
|
|
}
|
|
HtmlEntityMap::const_iterator iter = entityMap->find(entity);
|
|
if (iter != entityMap->end()) {
|
|
memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
|
|
*len = iter->second.length();
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
char *string_html_decode(const char *input, int &len,
|
|
bool decode_double_quote, bool decode_single_quote,
|
|
const char *charset_hint, bool all,
|
|
bool xhp /* = false */) {
|
|
assert(input);
|
|
|
|
if (!EntityMapInited) {
|
|
Lock lock(EntityMapMutex);
|
|
if (!EntityMapInited) {
|
|
init_entity_table();
|
|
EntityMapInited = true;
|
|
}
|
|
}
|
|
|
|
entity_charset charset = determine_charset(charset_hint);
|
|
if (charset == cs_unknown) {
|
|
return nullptr;
|
|
}
|
|
|
|
char *ret = (char *)malloc(len + 1);
|
|
char *q = ret;
|
|
for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
|
|
char ch = *p;
|
|
if (ch != '&') {
|
|
*q++ = ch;
|
|
continue;
|
|
}
|
|
p++;
|
|
|
|
bool found = false;
|
|
for (const char *t = p; *t; t++) {
|
|
if (*t == ';') {
|
|
int l = t - p;
|
|
if (l > 0) {
|
|
char sbuf[16] = {0};
|
|
char *buf;
|
|
if (l > 10) {
|
|
buf = (char* )malloc(l + 1);
|
|
} else {
|
|
buf = sbuf;
|
|
}
|
|
memcpy(buf, p, l);
|
|
buf[l] = '\0';
|
|
if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
|
|
charset, all, xhp)) {
|
|
memcpy(q, buf, l);
|
|
found = true;
|
|
p = t;
|
|
q += l;
|
|
}
|
|
if (buf != sbuf) {
|
|
free(buf);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (!found) {
|
|
p--;
|
|
*q++ = '&'; // not an entity
|
|
}
|
|
}
|
|
*q = '\0';
|
|
len = q - ret;
|
|
return ret;
|
|
}
|
|
|
|
/**
 * Returns the static entity_map array, running init_entity_table() first if
 * it has not run yet.
 */
const html_entity_map* html_get_entity_map() {
  if (EntityMapInited) {
    return entity_map;
  }

  // Not initialized yet as far as this caller can tell: take the mutex and
  // look again before doing the work.
  Lock lock(EntityMapMutex);
  if (!EntityMapInited) {
    init_entity_table();
    EntityMapInited = true;
  }
  return entity_map;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
}
|