From b4ca4bbbd0b2f307286d80ed75ab53950625fb0d Mon Sep 17 00:00:00 2001
From: steveo <steveo@fb.com>
Date: Tue, 26 Mar 2013 07:14:52 -0700
Subject: [PATCH] change use of html_supported_charset to avoid duplicating
 work

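Change determine_charset to return cs_unknown for an unrecognized charset name, and string_html_decode to return nullptr in that case. Callers now check for those results instead of first calling html_supported_charset (to pre-validate the charset name) and then calling determine_charset (which has to run through the same list of names anyway).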
---
 hphp/runtime/base/string_util.cpp | 10 ++++++----
 hphp/runtime/ext/ext_string.cpp   |  4 +++-
 hphp/util/parser/scanner.cpp      |  9 ++++++++-
 hphp/util/zend/zend_html.cpp      | 22 ++++------------------
 hphp/util/zend/zend_html.h        | 14 +++++++++++++-
 5 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/hphp/runtime/base/string_util.cpp b/hphp/runtime/base/string_util.cpp
index 2c9b1ec19..bd4fe6074 100644
--- a/hphp/runtime/base/string_util.cpp
+++ b/hphp/runtime/base/string_util.cpp
@@ -449,13 +449,15 @@ String StringUtil::HtmlDecode(CStrRef input, QuoteStyle quoteStyle,
 
   assert(charset);
 
-  if (!html_supported_charset(charset)) {
-    throw NotImplementedException(charset);
-  }
-
   int len = input.size();
   char *ret = string_html_decode(input, len, quoteStyle != NoQuotes,
                                  quoteStyle == BothQuotes, charset, all);
+  if (!ret) {
+    // null iff charset was not recognized
+    throw NotImplementedException(charset);
+    // (charset is not null, see assertion above)
+  }
+
   return String(ret, len, AttachString);
 }
 
diff --git a/hphp/runtime/ext/ext_string.cpp b/hphp/runtime/ext/ext_string.cpp
index 81fe95332..c3b35d8c6 100644
--- a/hphp/runtime/ext/ext_string.cpp
+++ b/hphp/runtime/ext/ext_string.cpp
@@ -807,8 +807,10 @@ static const HtmlBasicEntity basic_entities[] = {
 };
 
 Array f_get_html_translation_table(int table, int quote_style) {
+  static entity_charset charset = determine_charset(nullptr); // get default one
   char ind[2]; ind[1] = 0;
-  entity_charset charset = determine_charset(NULL);
+
+  assert(charset != entity_charset_enum::cs_unknown);
 
   const int HTML_SPECIALCHARS = 0;
   const int HTML_ENTITIES = 1;
diff --git a/hphp/util/parser/scanner.cpp b/hphp/util/parser/scanner.cpp
index 15c9923f3..6522f84ef 100644
--- a/hphp/util/parser/scanner.cpp
+++ b/hphp/util/parser/scanner.cpp
@@ -67,8 +67,15 @@ bool ScannerToken::htmlTrim() {
 
 void ScannerToken::xhpDecode() {
   int len = m_text.size();
+  // note: 5th arg is charset_hint string; here we pass nullptr to indicate
+  // "use the default one" which is UTF-8.  (Just saves a charset lookup.)
   char *ret = string_html_decode(m_text.c_str(), len, true,
-                                 false, "UTF-8", true, true);
+                                 false, nullptr, true, true);
+  // safety check: decode function returns null iff charset unrecognized;
+  // i.e. a nullptr result would mean the default (UTF-8) was NOT recognized.
+  // Pretty sure it is universally available!
+  // (Do assertion anyway.)
+  assert(ret);
   m_text = string(ret, len);
   free(ret);
 }
diff --git a/hphp/util/zend/zend_html.cpp b/hphp/util/zend/zend_html.cpp
index e1ce85667..a123cf027 100644
--- a/hphp/util/zend/zend_html.cpp
+++ b/hphp/util/zend/zend_html.cpp
@@ -310,14 +310,13 @@ static const struct {
 ///////////////////////////////////////////////////////////////////////////////
 
 entity_charset determine_charset(const char *charset_hint) {
-  entity_charset charset = cs_utf_8;
+  entity_charset charset = cs_unknown;
 
   if (charset_hint == nullptr) {
     // default to utf-8
     return cs_utf_8;
   }
 
-  DEBUG_ONLY bool found = false;
   size_t len = strlen(charset_hint);
 
   /* now walk the charset map and look for the codeset */
@@ -325,15 +324,10 @@ entity_charset determine_charset(const char *charset_hint) {
     if (len == strlen(charset_map[i].codeset) &&
       strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
       charset = charset_map[i].charset;
-      found = true;
       break;
     }
   }
 
-  // All code paths that go into this check html_supported_charset()
-  // and throw if not.
-  assert(found && "currently we expect to only use supported charsets");
-
   return charset;
 }
 
@@ -766,6 +760,9 @@ char *string_html_decode(const char *input, int &len,
   }
 
   entity_charset charset = determine_charset(charset_hint);
+  if (charset == cs_unknown) {
+    return nullptr;
+  }
 
   char *ret = (char *)malloc(len + 1);
   char *q = ret;
@@ -826,16 +823,5 @@ const html_entity_map* html_get_entity_map() {
   return entity_map;
 }
 
-bool html_supported_charset(const char *charset) {
-  size_t len = strlen(charset);
-  for (int i = 0; charset_map[i].codeset; i++) {
-     if (len == strlen(charset_map[i].codeset) &&
-       strncasecmp(charset, charset_map[i].codeset, len) == 0) {
-       return true;
-     }
-   }
-  return false;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 }
diff --git a/hphp/util/zend/zend_html.h b/hphp/util/zend/zend_html.h
index fe1e9d1d6..72e108e45 100644
--- a/hphp/util/zend/zend_html.h
+++ b/hphp/util/zend/zend_html.h
@@ -62,6 +62,7 @@ enum entity_charset {
   cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
   cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
   cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
+  cs_unknown,
   cs_end
 };
 }
@@ -85,6 +86,10 @@ struct html_entity_map {
 
 const html_entity_map* html_get_entity_map();
 
+/*
+ * returns cs_unknown iff not found;
+ * if input null, returns default charset of cs_utf_8
+ */
 entity_charset determine_charset(const char*);
 
 char *string_html_encode(const char *input, int &len, bool encode_double_quote,
@@ -92,11 +97,18 @@ char *string_html_encode(const char *input, int &len, bool encode_double_quote,
 char *string_html_encode_extra(const char *input, int &len,
                                StringHtmlEncoding flags,
                                const AsciiMap *asciiMap);
+
+/**
+ * returns decoded string;
+ * note, can return nullptr if the charset could not be detected
+ * using the given charset_hint; can also pass in nullptr
+ * for the charset_hint to use the default one (UTF-8).
+ * (see determine_charset).
+ */
 char *string_html_decode(const char *input, int &len,
                          bool decode_double_quote, bool decode_single_quote,
                          const char *charset_hint,
                          bool all, bool xhp = false );
-bool html_supported_charset(const char *charset);
 
 ///////////////////////////////////////////////////////////////////////////////
 }