change use of html_supported_charset to avoid duplicating work

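Change callers of determine_charset (whether direct or via string_html_decode) to check for an "unrecognized charset" result -- cs_unknown from determine_charset, nullptr from string_html_decode -- instead of first calling html_supported_charset to pre-validate the charset name and then calling determine_charset, which has to run through the same list of names anyway.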
2013-03-26 07:14:52 -07:00
commit b4ca4bbbd0
@@ -449,13 +449,15 @@ String StringUtil::HtmlDecode(CStrRef input, QuoteStyle quoteStyle,

  assert(charset);

-  if (!html_supported_charset(charset)) {
-    throw NotImplementedException(charset);
-  }
-
  int len = input.size();
  char *ret = string_html_decode(input, len, quoteStyle != NoQuotes,
                                 quoteStyle == BothQuotes, charset, all);
+  if (!ret) {
+    // null iff charset was not recognized
+    throw NotImplementedException(charset);
+    // (charset is not null, see assertion above)
+  }
+
  return String(ret, len, AttachString);
 }

@@ -807,8 +807,10 @@ static const HtmlBasicEntity basic_entities[] = {
 };

 Array f_get_html_translation_table(int table, int quote_style) {
+  static entity_charset charset = determine_charset(nullptr); // get default one
  char ind[2]; ind[1] = 0;
-  entity_charset charset = determine_charset(NULL);
+
+  assert(charset != entity_charset_enum::cs_unknown);

  const int HTML_SPECIALCHARS = 0;
  const int HTML_ENTITIES = 1;
@@ -67,8 +67,15 @@ bool ScannerToken::htmlTrim() {

 void ScannerToken::xhpDecode() {
  int len = m_text.size();
+  // note: 5th arg is charset_hint string; here we pass nullptr to indicate
+  // "use the default one" which is UTF-8.  (Just saves a charset lookup.)
  char *ret = string_html_decode(m_text.c_str(), len, true,
-                                 false, "UTF-8", true, true);
+                                 false, nullptr, true, true);
+  // Safety check: the decode function returns nullptr iff the charset is
+  // unrecognized, i.e. a nullptr result here would mean UTF-8 is unavailable.
+  // Pretty sure UTF-8 is universally available!
+  // (Assert anyway.)
+  assert(ret);
  m_text = string(ret, len);
  free(ret);
 }
@@ -310,14 +310,13 @@ static const struct {
 ///////////////////////////////////////////////////////////////////////////////

 entity_charset determine_charset(const char *charset_hint) {
-  entity_charset charset = cs_utf_8;
+  entity_charset charset = cs_unknown;

  if (charset_hint == nullptr) {
    // default to utf-8
    return cs_utf_8;
  }

-  DEBUG_ONLY bool found = false;
  size_t len = strlen(charset_hint);

  /* now walk the charset map and look for the codeset */
@@ -325,15 +324,10 @@ entity_charset determine_charset(const char *charset_hint) {
    if (len == strlen(charset_map[i].codeset) &&
      strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
      charset = charset_map[i].charset;
-      found = true;
      break;
    }
  }

-  // All code paths that go into this check html_supported_charset()
-  // and throw if not.
-  assert(found && "currently we expect to only use supported charsets");
-
  return charset;
 }

@@ -766,6 +760,9 @@ char *string_html_decode(const char *input, int &len,
  }

  entity_charset charset = determine_charset(charset_hint);
+  if (charset == cs_unknown) {
+    return nullptr;
+  }

  char *ret = (char *)malloc(len + 1);
  char *q = ret;
@@ -826,16 +823,5 @@ const html_entity_map* html_get_entity_map() {
  return entity_map;
 }

-bool html_supported_charset(const char *charset) {
-  size_t len = strlen(charset);
-  for (int i = 0; charset_map[i].codeset; i++) {
-     if (len == strlen(charset_map[i].codeset) &&
-       strncasecmp(charset, charset_map[i].codeset, len) == 0) {
-       return true;
-     }
-   }
-  return false;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 }
@@ -62,6 +62,7 @@ enum entity_charset {
  cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
  cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
  cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
+  cs_unknown,
  cs_end
 };
 }
@@ -85,6 +86,10 @@ struct html_entity_map {

 const html_entity_map* html_get_entity_map();

+/*
+ * returns cs_unknown iff not found;
+ * if input null, returns default charset of cs_utf_8
+ */
 entity_charset determine_charset(const char*);

 char *string_html_encode(const char *input, int &len, bool encode_double_quote,
@@ -92,11 +97,18 @@ char *string_html_encode(const char *input, int &len, bool encode_double_quote,
 char *string_html_encode_extra(const char *input, int &len,
                               StringHtmlEncoding flags,
                               const AsciiMap *asciiMap);
+
+/**
+ * Returns the decoded string.
+ * Note: returns nullptr if charset_hint does not name a recognized
+ * charset. Pass nullptr as charset_hint to use the default
+ * charset (UTF-8).
+ * (See determine_charset.)
+ */
 char *string_html_decode(const char *input, int &len,
                         bool decode_double_quote, bool decode_single_quote,
                         const char *charset_hint,
                         bool all, bool xhp = false );
-bool html_supported_charset(const char *charset);

 ///////////////////////////////////////////////////////////////////////////////
 }