From b4ca4bbbd0b2f307286d80ed75ab53950625fb0d Mon Sep 17 00:00:00 2001 From: steveo Date: Tue, 26 Mar 2013 07:14:52 -0700 Subject: [PATCH] change use of html_supported_charset to avoid duplicating work change callers of determine_charset to check for nulls, instead of calling html_supported_charset (to pre-validate charset name) and then calling this (which has to run through a list of names anyway). --- hphp/runtime/base/string_util.cpp | 10 ++++++---- hphp/runtime/ext/ext_string.cpp | 4 +++- hphp/util/parser/scanner.cpp | 9 ++++++++- hphp/util/zend/zend_html.cpp | 22 ++++------------------ hphp/util/zend/zend_html.h | 14 +++++++++++++- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/hphp/runtime/base/string_util.cpp b/hphp/runtime/base/string_util.cpp index 2c9b1ec19..bd4fe6074 100644 --- a/hphp/runtime/base/string_util.cpp +++ b/hphp/runtime/base/string_util.cpp @@ -449,13 +449,15 @@ String StringUtil::HtmlDecode(CStrRef input, QuoteStyle quoteStyle, assert(charset); - if (!html_supported_charset(charset)) { - throw NotImplementedException(charset); - } - int len = input.size(); char *ret = string_html_decode(input, len, quoteStyle != NoQuotes, quoteStyle == BothQuotes, charset, all); + if (!ret) { + // null iff charset was not recognized + throw NotImplementedException(charset); + // (charset is not null, see assertion above) + } + return String(ret, len, AttachString); } diff --git a/hphp/runtime/ext/ext_string.cpp b/hphp/runtime/ext/ext_string.cpp index 81fe95332..c3b35d8c6 100644 --- a/hphp/runtime/ext/ext_string.cpp +++ b/hphp/runtime/ext/ext_string.cpp @@ -807,8 +807,10 @@ static const HtmlBasicEntity basic_entities[] = { }; Array f_get_html_translation_table(int table, int quote_style) { + static entity_charset charset = determine_charset(nullptr); // get default one char ind[2]; ind[1] = 0; - entity_charset charset = determine_charset(NULL); + + assert(charset != entity_charset_enum::cs_unknown); const int HTML_SPECIALCHARS = 0; const int 
HTML_ENTITIES = 1; diff --git a/hphp/util/parser/scanner.cpp b/hphp/util/parser/scanner.cpp index 15c9923f3..6522f84ef 100644 --- a/hphp/util/parser/scanner.cpp +++ b/hphp/util/parser/scanner.cpp @@ -67,8 +67,15 @@ bool ScannerToken::htmlTrim() { void ScannerToken::xhpDecode() { int len = m_text.size(); + // note: 5th arg is charset_hint string; here we pass nullptr to indicate + // "use the default one" which is UTF-8. (Just saves a charset lookup.) char *ret = string_html_decode(m_text.c_str(), len, true, - false, "UTF-8", true, true); + false, nullptr, true, true); + // safety check: decode function returns null iff charset unrecognized; + // i.e. nullptr result would mean UTF-8 is unavailable. + // Pretty sure it is universally available! + // (Do assertion anyway.) + assert(ret); m_text = string(ret, len); free(ret); } diff --git a/hphp/util/zend/zend_html.cpp b/hphp/util/zend/zend_html.cpp index e1ce85667..a123cf027 100644 --- a/hphp/util/zend/zend_html.cpp +++ b/hphp/util/zend/zend_html.cpp @@ -310,14 +310,13 @@ static const struct { /////////////////////////////////////////////////////////////////////////////// entity_charset determine_charset(const char *charset_hint) { - entity_charset charset = cs_utf_8; + entity_charset charset = cs_unknown; if (charset_hint == nullptr) { // default to utf-8 return cs_utf_8; } - DEBUG_ONLY bool found = false; size_t len = strlen(charset_hint); /* now walk the charset map and look for the codeset */ @@ -325,15 +324,10 @@ entity_charset determine_charset(const char *charset_hint) { if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { charset = charset_map[i].charset; - found = true; break; } } - // All code paths that go into this check html_supported_charset() - // and throw if not. 
- assert(found && "currently we expect to only use supported charsets"); - return charset; } @@ -766,6 +760,9 @@ char *string_html_decode(const char *input, int &len, } entity_charset charset = determine_charset(charset_hint); + if (charset == cs_unknown) { + return nullptr; + } char *ret = (char *)malloc(len + 1); char *q = ret; @@ -826,16 +823,5 @@ const html_entity_map* html_get_entity_map() { return entity_map; } -bool html_supported_charset(const char *charset) { - size_t len = strlen(charset); - for (int i = 0; charset_map[i].codeset; i++) { - if (len == strlen(charset_map[i].codeset) && - strncasecmp(charset, charset_map[i].codeset, len) == 0) { - return true; - } - } - return false; -} - /////////////////////////////////////////////////////////////////////////////// } diff --git a/hphp/util/zend/zend_html.h b/hphp/util/zend/zend_html.h index fe1e9d1d6..72e108e45 100644 --- a/hphp/util/zend/zend_html.h +++ b/hphp/util/zend/zend_html.h @@ -62,6 +62,7 @@ enum entity_charset { cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, cs_cp1251, cs_8859_5, cs_cp866, cs_macroman, + cs_unknown, cs_end }; } @@ -85,6 +86,10 @@ struct html_entity_map { const html_entity_map* html_get_entity_map(); +/* + * returns cs_unknown iff not found; + * if input null, returns default charset of cs_utf_8 + */ entity_charset determine_charset(const char*); char *string_html_encode(const char *input, int &len, bool encode_double_quote, @@ -92,11 +97,18 @@ char *string_html_encode(const char *input, int &len, bool encode_double_quote, char *string_html_encode_extra(const char *input, int &len, StringHtmlEncoding flags, const AsciiMap *asciiMap); + +/** + * returns decoded string; + * note, can return nullptr if the charset could not be detected + * using the given charset_hint; can also pass in nullptr + * for the charset_hint to use the default one (UTF-8). + * (see determine_charset). 
+ */ char *string_html_decode(const char *input, int &len, bool decode_double_quote, bool decode_single_quote, const char *charset_hint, bool all, bool xhp = false ); -bool html_supported_charset(const char *charset); /////////////////////////////////////////////////////////////////////////////// }