change use of html_supported_charset to avoid duplicating work
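Change callers of determine_charset to check its result for an unrecognized charset (cs_unknown, which string_html_decode surfaces as a null return), instead of first calling html_supported_charset to pre-validate the charset name and then calling determine_charset, which has to run through the same list of names anyway.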
Esse commit está contido em:
@@ -449,13 +449,15 @@ String StringUtil::HtmlDecode(CStrRef input, QuoteStyle quoteStyle,
|
||||
|
||||
assert(charset);
|
||||
|
||||
if (!html_supported_charset(charset)) {
|
||||
throw NotImplementedException(charset);
|
||||
}
|
||||
|
||||
int len = input.size();
|
||||
char *ret = string_html_decode(input, len, quoteStyle != NoQuotes,
|
||||
quoteStyle == BothQuotes, charset, all);
|
||||
if (!ret) {
|
||||
// null iff charset was not recognized
|
||||
throw NotImplementedException(charset);
|
||||
// (charset is not null, see assertion above)
|
||||
}
|
||||
|
||||
return String(ret, len, AttachString);
|
||||
}
|
||||
|
||||
|
||||
@@ -807,8 +807,10 @@ static const HtmlBasicEntity basic_entities[] = {
|
||||
};
|
||||
|
||||
Array f_get_html_translation_table(int table, int quote_style) {
|
||||
static entity_charset charset = determine_charset(nullptr); // get default one
|
||||
char ind[2]; ind[1] = 0;
|
||||
entity_charset charset = determine_charset(NULL);
|
||||
|
||||
assert(charset != entity_charset_enum::cs_unknown);
|
||||
|
||||
const int HTML_SPECIALCHARS = 0;
|
||||
const int HTML_ENTITIES = 1;
|
||||
|
||||
@@ -67,8 +67,15 @@ bool ScannerToken::htmlTrim() {
|
||||
|
||||
void ScannerToken::xhpDecode() {
|
||||
int len = m_text.size();
|
||||
// note: 5th arg is charset_hint string; here we pass nullptr to indicate
|
||||
// "use the default one" which is UTF-8. (Just saves a charset lookup.)
|
||||
char *ret = string_html_decode(m_text.c_str(), len, true,
|
||||
false, "UTF-8", true, true);
|
||||
false, nullptr, true, true);
|
||||
// safety check: decode function returns null iff charset unrecognized;
|
||||
// i.e. a nullptr result would mean UTF-8 is NOT available.
|
||||
// Pretty sure it is universally available!
|
||||
// (Do assertion anyway.)
|
||||
assert(ret);
|
||||
m_text = string(ret, len);
|
||||
free(ret);
|
||||
}
|
||||
|
||||
@@ -310,14 +310,13 @@ static const struct {
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
entity_charset determine_charset(const char *charset_hint) {
|
||||
entity_charset charset = cs_utf_8;
|
||||
entity_charset charset = cs_unknown;
|
||||
|
||||
if (charset_hint == nullptr) {
|
||||
// default to utf-8
|
||||
return cs_utf_8;
|
||||
}
|
||||
|
||||
DEBUG_ONLY bool found = false;
|
||||
size_t len = strlen(charset_hint);
|
||||
|
||||
/* now walk the charset map and look for the codeset */
|
||||
@@ -325,15 +324,10 @@ entity_charset determine_charset(const char *charset_hint) {
|
||||
if (len == strlen(charset_map[i].codeset) &&
|
||||
strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
|
||||
charset = charset_map[i].charset;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// All code paths that go into this check html_supported_charset()
|
||||
// and throw if not.
|
||||
assert(found && "currently we expect to only use supported charsets");
|
||||
|
||||
return charset;
|
||||
}
|
||||
|
||||
@@ -766,6 +760,9 @@ char *string_html_decode(const char *input, int &len,
|
||||
}
|
||||
|
||||
entity_charset charset = determine_charset(charset_hint);
|
||||
if (charset == cs_unknown) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
char *ret = (char *)malloc(len + 1);
|
||||
char *q = ret;
|
||||
@@ -826,16 +823,5 @@ const html_entity_map* html_get_entity_map() {
|
||||
return entity_map;
|
||||
}
|
||||
|
||||
bool html_supported_charset(const char *charset) {
|
||||
size_t len = strlen(charset);
|
||||
for (int i = 0; charset_map[i].codeset; i++) {
|
||||
if (len == strlen(charset_map[i].codeset) &&
|
||||
strncasecmp(charset, charset_map[i].codeset, len) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
}
|
||||
|
||||
@@ -62,6 +62,7 @@ enum entity_charset {
|
||||
cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
|
||||
cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
|
||||
cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
|
||||
cs_unknown,
|
||||
cs_end
|
||||
};
|
||||
}
|
||||
@@ -85,6 +86,10 @@ struct html_entity_map {
|
||||
|
||||
const html_entity_map* html_get_entity_map();
|
||||
|
||||
/*
|
||||
* returns cs_unknown iff not found;
|
||||
* if input null, returns default charset of cs_utf_8
|
||||
*/
|
||||
entity_charset determine_charset(const char*);
|
||||
|
||||
char *string_html_encode(const char *input, int &len, bool encode_double_quote,
|
||||
@@ -92,11 +97,18 @@ char *string_html_encode(const char *input, int &len, bool encode_double_quote,
|
||||
char *string_html_encode_extra(const char *input, int &len,
|
||||
StringHtmlEncoding flags,
|
||||
const AsciiMap *asciiMap);
|
||||
|
||||
/**
|
||||
* returns decoded string;
|
||||
* note, can return nullptr if the charset could not be detected
|
||||
* using the given charset_hint; can also pass in nullptr
|
||||
* for the charset_hint to use the default one (UTF-8).
|
||||
* (see determine_charset).
|
||||
*/
|
||||
char *string_html_decode(const char *input, int &len,
|
||||
bool decode_double_quote, bool decode_single_quote,
|
||||
const char *charset_hint,
|
||||
bool all, bool xhp = false );
|
||||
bool html_supported_charset(const char *charset);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
}
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário