diff --git a/hphp/test/ext/test_ext.h b/hphp/test/ext/test_ext.h index c7b61a365..d5b1f7542 100644 --- a/hphp/test/ext/test_ext.h +++ b/hphp/test/ext/test_ext.h @@ -35,9 +35,6 @@ #include "hphp/facebook/extensions/tao/test_ext_tao.h" #include "hphp/facebook/extensions/urlextraction/test_ext_urlextraction.h" #include "hphp/test/ext/test_ext_curl.h" -#include "hphp/test/ext/test_ext_icu_ucnv.h" -#include "hphp/test/ext/test_ext_icu_ucsdet.h" -#include "hphp/test/ext/test_ext_icu_uspoof.h" #include "hphp/test/ext/test_ext_imagesprite.h" #include "hphp/test/ext/test_ext_intl.h" #include "hphp/test/ext/test_ext_ipc.h" diff --git a/hphp/test/ext/test_ext_icu_ucnv.cpp b/hphp/test/ext/test_ext_icu_ucnv.cpp deleted file mode 100644 index 3a1e4a1cc..000000000 --- a/hphp/test/ext/test_ext_icu_ucnv.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#include "hphp/test/ext/test_ext_icu_ucnv.h" -#include "hphp/runtime/ext/ext_icu_ucnv.h" - -IMPLEMENT_SEP_EXTENSION_TEST(Icu_ucnv); -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu_ucnv::RunTests(const std::string &which) { - bool ret = true; - - RUN_TEST(test_UConverter); - - return ret; -} - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu_ucnv::test_UConverter() { - // Handled in TestCodeRun - return Count(true); -} diff --git a/hphp/test/ext/test_ext_icu_ucnv.h b/hphp/test/ext/test_ext_icu_ucnv.h deleted file mode 100644 index c6b705248..000000000 --- a/hphp/test/ext/test_ext_icu_ucnv.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#ifndef incl_HPHP_TEST_EXT_ICU_UCNV_H_ -#define incl_HPHP_TEST_EXT_ICU_UCNV_H_ - -// >>>>>> Generated by idl.php. Do NOT modify. <<<<<< - -#include "hphp/test/ext/test_cpp_ext.h" - -/////////////////////////////////////////////////////////////////////////////// - -class TestExtIcu_ucnv : public TestCppExt { - public: - virtual bool RunTests(const std::string &which); - - bool test_UConverter(); -}; - -/////////////////////////////////////////////////////////////////////////////// - -#endif // incl_HPHP_TEST_EXT_ICU_UCNV_H_ diff --git a/hphp/test/ext/test_ext_icu_ucsdet.cpp b/hphp/test/ext/test_ext_icu_ucsdet.cpp deleted file mode 100644 index e4308acbd..000000000 --- a/hphp/test/ext/test_ext_icu_ucsdet.cpp +++ /dev/null @@ -1,382 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#include "hphp/test/ext/test_ext_icu_ucsdet.h" -#include "hphp/runtime/ext/ext_icu_ucsdet.h" -#include "hphp/runtime/ext/ext_array.h" - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu_ucsdet::RunTests(const std::string &which) { - bool ret = true; - - RUN_TEST(test_basics); - - // Special cases - RUN_TEST(test_empty); - RUN_TEST(test_cannot_detect); - RUN_TEST(test_declared_encoding); - - // English and Western European - RUN_TEST(test_hello_world); - RUN_TEST(test_windows_1252); - - // Eastern European - RUN_TEST(test_windows_1250); - - // Arabic - RUN_TEST(test_windows_1256); - - // Japanese - RUN_TEST(test_shift_jis); - RUN_TEST(test_euc_jp); - RUN_TEST(test_iso_2022_jp); - - // Chinese - RUN_TEST(test_gb2312); - RUN_TEST(test_big5); - - // Cyrillic - RUN_TEST(test_koi8r); - RUN_TEST(test_windows_1251); - - // Universal - RUN_TEST(test_utf8); - RUN_TEST(test_utf16); - - return ret; -} - -bool TestExtIcu_ucsdet::detect_and_convert_to_utf8( - CStrRef bytes, - CStrRef utf8) { - p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)()); - detector->t_settext(bytes); - Object matchObj = detector->t_detect(); - p_EncodingMatch match = matchObj.getTyped(); - if (!match->t_isvalid()) { - return false; - } - return match->t_getutf8() == utf8; -} - -bool TestExtIcu_ucsdet::test_uninitialized() { - p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)()); - - try { - Object matchObj = detector->t_detect(); - } catch (Exception& e) { - return Count(true); - } - - return Count(false); -} - -bool TestExtIcu_ucsdet::test_basics() { - // This is as unmistakably UTF-8 as it gets. - const char* utf8_snowman_with_bom = "\uFEFF\u2603"; - - p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)()); - - detector->t_settext(utf8_snowman_with_bom); - Object matchObj = detector->t_detect(); - p_EncodingMatch match = matchObj.getTyped(); - VERIFY(match->t_isvalid() == true); - VERIFY(match->t_getencoding() == "UTF-8"); - VERIFY(match->t_getconfidence() == 100); - VERIFY(match->t_getutf8() == utf8_snowman_with_bom); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_empty() { - VERIFY(detect_and_convert_to_utf8("", "") == true); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_cannot_detect() { - p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)()); - - // The detector has no idea what to do with this. - detector->t_settext("\xc7\xe8\xec\xed\xe8\xe9 \xe2\xe5\xf7\xe5\xf0"); - Object matchObj = detector->t_detect(); - p_EncodingMatch match = matchObj.getTyped(); - VERIFY(match->t_isvalid() == false); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_declared_encoding() { - // Right now (ICU 4.6), this API doesn't actually do anything, but - // let's at least verify it doesn't crash. - p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)()); - detector->t_settext("Yo!"); - detector->t_setdeclaredencoding("windows-1251"); - - Object matchObj = detector->t_detect(); - p_EncodingMatch match = matchObj.getTyped(); - VERIFY(match->t_isvalid() == true); - VERIFY(match->t_getutf8() == "Yo!"); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_hello_world() { - VERIFY(detect_and_convert_to_utf8("Hello, world!", "Hello, world!") == true); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_windows_1252() { - VERIFY(detect_and_convert_to_utf8( - "Toda Europa ley\xf3 Don Quijote como una s\xe1tira.", - "Toda Europa ley\u00f3 Don Quijote como una s\u00e1tira.") == true); - - VERIFY(detect_and_convert_to_utf8( - "Notre P\xe8re, qui \xeates aux cieux", - "Notre P\u00e8re, qui \u00eates aux cieux") == true); - - VERIFY(detect_and_convert_to_utf8( - "Marta da Silva, als beste Spielerin und beste Torsch\xFCtzin der WM " - "2007 sowie bisher f\xFCnf Mal als \x84Weltfu\xDF" "ballerin des " - "Jahres\x93 ausgezeichnet, kommt zur Welt.", - "Marta da Silva, als beste Spielerin und beste Torsch\u00fctzin der WM " - "2007 sowie bisher f\u00FCnf Mal als \u201EWeltfu\u00DFballerin des " - "Jahres\u201C ausgezeichnet, kommt zur Welt.") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_windows_1250() { - VERIFY(detect_and_convert_to_utf8( - "Do Wikipedie m\xf9\x9e" "e p\xf8isp\xedvat kdokoliv.", - "Do Wikipedie m\u016f\u017ee p\u0159isp\u00edvat kdokoliv.") == true); - - VERIFY(detect_and_convert_to_utf8( - "O\xe8" "e na\x9a, koji jesi na nebesima, sveti se ime Tvoje.", - "O\u010de na\u0161, koji jesi na nebesima, sveti se ime Tvoje.") == true); - - VERIFY(detect_and_convert_to_utf8( - "Prezentacj\xea pierwszego graficznego \x9crodowiska pracy z rodziny " - "Windows firmy Microsoft przeprowadzono w listopadzie 1985.", - "Prezentacj\u0119 pierwszego graficznego \u015brodowiska pracy z rodziny " - "Windows firmy Microsoft przeprowadzono w listopadzie 1985.") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_windows_1256() { - VERIFY(detect_and_convert_to_utf8( - "\xe1\xc7 \xc3\xca\xdf\xe1\xe3 \xc7\xe1\xda\xd1\xc8\xed\xc9", - "\u0644\u0627 \u0623\u062a\u0643\u0644\u0645 \u0627\u0644\u0639\u0631\u0628" - "\u064a\u0629") == true); - - VERIFY(detect_and_convert_to_utf8( - "\xe6\xed\xdf\xed\xc8\xed\xcf\xed\xc7 \xe5\xed \xe3\xd4\xd1\xe6\xda \xe3" - "\xe6\xd3\xe6\xda\xc9 \xe3\xca\xda\xcf\xcf\xc9 \xc7\xe1\xe1\xdb\xc7\xca" - "\xa1 \xe3\xc8\xe4\xed\xc9 \xda\xe1\xec \xc7\xe1\xe6\xed\xc8\xa1 \xd0\xc7" - "\xca \xe3\xcd\xca\xe6\xec \xcd\xd1\xa1 \xca\xd4\xdb\xe1\xe5\xc7 \xe3\xc4" - "\xd3\xd3\xc9 \xe6\xed\xdf\xed\xe3\xed\xcf\xed\xc7\xa1 \xc7\xe1\xca" - "\xed \xe5\xed \xe3\xe4\xd9\xe3\xc9 \xdb\xed\xd1 \xd1\xc8\xcd\xed\xc9.", - "\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u0647\u064a \u0645" - "\u0634\u0631\u0648\u0639 \u0645\u0648\u0633\u0648\u0639\u0629 \u0645\u062a" - "\u0639\u062f\u062f\u0629 \u0627\u0644\u0644\u063a\u0627\u062a\u060c \u0645" - "\u0628\u0646\u064a\u0629 \u0639\u0644\u0649 \u0627\u0644\u0648\u064a\u0628" - "\u060c \u0630\u0627\u062a \u0645\u062d\u062a\u0648\u0649 \u062d\u0631" - "\u060c \u062a\u0634\u063a\u0644\u0647\u0627 \u0645\u0624\u0633\u0633" - "\u0629 \u0648\u064a\u0643\u064a\u0645\u064a\u062f\u064a\u0627\u060c \u0627" - "\u0644\u062a\u064a \u0647\u064a \u0645\u0646\u0638\u0645\u0629 \u063a" - "\u064a\u0631 \u0631\u0628\u062d\u064a\u0629.") == true); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_shift_jis() { - VERIFY(detect_and_convert_to_utf8( - "\x81w\x82\xc6\x82\xc8\x82\xe8\x82\xcc\x83g\x83g\x83\x8d\x81x\x82\xcd\x81" - "A\x83X\x83^\x83W\x83I\x83W\x83u\x83\x8a\x90\xa7\x8d\xec\x82\xcc\x93\xfa" - "\x96{\x82\xcc\x92\xb7\x95\xd2\x83" "A\x83j\x83\x81\x81[\x83V\x83\x87\x83" - "\x93\x8d\xec\x95i\x81" "B", - "\u300e\u3068\u306a\u308a\u306e\u30c8\u30c8\u30ed\u300f\u306f\u3001\u30b9" - "\u30bf\u30b8\u30aa\u30b8\u30d6\u30ea\u5236\u4f5c\u306e\u65e5\u672c\u306e" - "\u9577\u7de8\u30a2\u30cb\u30e1\u30fc\u30b7\u30e7\u30f3\u4f5c\u54c1\u3002") - == true); - - VERIFY(detect_and_convert_to_utf8( - "\x83" "E\x83" "B\x83L\x83y\x83" "f\x83" "B\x83" "A (Wikipedia) \x82\xcd" - "\x83" "E\x83" "B\x83L\x83\x81\x83" "f\x83" "B\x83" "A\x8d\xe0\x92" - "c\x82\xaa\x89^\x89" "c\x82\xb7\x82\xe9\x83I\x83\x93\x83\x89\x83" "C\x83" - "\x93\x95S\x89\xc8\x8e\x96\x93T\x81" "B", - "\u30a6\u30a3\u30ad\u30da\u30c7\u30a3\u30a2 (Wikipedia) \u306f\u30a6" - "\u30a3\u30ad\u30e1\u30c7\u30a3\u30a2\u8ca1\u56e3\u304c\u904b\u55b6\u3059" - "\u308b\u30aa\u30f3\u30e9\u30a4\u30f3\u767e\u79d1\u4e8b\u5178\u3002") - == true); - - // Too short; detector thinks it's most likely Windows-1252. - // - // VERIFY(detect_and_convert_to_utf8( - // "\x90\xa2\x8a" "E\x90l\x8c\xa0\x90\xe9\x8c\xbe", - // "\u4e16\u754c\u4eba\u6a29\u5ba3\u8a00") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_euc_jp() { - VERIFY(detect_and_convert_to_utf8( - "\xb2\xbf\xbf\xcd\xa4\xe2\xa1\xa2\xa4\xdb\xa4\xb7\xa4\xa4\xa4\xde\xa4\xde" - "\xa4\xcb\xc2\xe1\xca\xe1\xa1\xa2\xb9\xb4\xb6\xd8\xa1\xa2\xcb\xf4\xa4\xcf" - "\xc4\xc9\xca\xfc\xa4\xb5\xa4\xec\xa4\xeb\xa4\xb3\xa4\xc8\xa4\xcf\xa4\xca" - "\xa4\xa4\xa1\xa3", - "\u4f55\u4eba\u3082\u3001\u307b\u3057\u3044\u307e\u307e\u306b\u902e\u6355" - "\u3001\u62d8\u7981\u3001\u53c8\u306f\u8ffd\u653e\u3055\u308c\u308b\u3053" - "\u3068\u306f\u306a\u3044\u3002") == true); - VERIFY(detect_and_convert_to_utf8( - "1920\xc7\xaf \xa5\xa6\xa5\xa3\xa5\xf3\xa5\xd6\xa5\xeb\xa5\xc9\xa5\xf3\xc1" - "\xaa\xbc\xea\xb8\xa2\xa1\xcaThe Championships, Wimbledon 1920\xa1\xcb\xa4" - "\xcb\xb4\xd8\xa4\xb9\xa4\xeb\xb5\xad\xbb\xf6\xa1\xa3", - "1920\u5e74 \u30a6\u30a3\u30f3\u30d6\u30eb\u30c9\u30f3\u9078\u624b\u6a29" - "\uff08The Championships, Wimbledon 1920\uff09\u306b\u95a2\u3059\u308b" - "\u8a18\u4e8b\u3002") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_iso_2022_jp() { - VERIFY(detect_and_convert_to_utf8( - "\x1b$B%-%M%F%#%C%/%3%M%/%7%g%s!J\x1b(Bkinetic connection\x1b$B!K$O\x1b(B1" - "986\x1b$BG/$K%=%K!<$,\x1b(BMSX2\x1b$B$GH/Gd$7$?%Q%:%k%2!<%`!#\x1b(B", - "\u30ad\u30cd\u30c6\u30a3\u30c3\u30af\u30b3\u30cd\u30af\u30b7\u30e7\u30f3" - "\uff08kinetic connection\uff09\u306f1986\u5e74\u306b\u30bd\u30cb\u30fc" - "\u304cMSX2\u3067\u767a\u58f2\u3057\u305f\u30d1\u30ba\u30eb\u30b2\u30fc" - "\u30e0\u3002") == true); - VERIFY(detect_and_convert_to_utf8( - "\x1b$B5f6K\x1b(B!!\x1b$BJQBV2>LL\x1b(B", - "\u7a76\u6975!!\u5909\u614b\u4eee\u9762") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_gb2312() { - VERIFY(detect_and_convert_to_utf8( - "\xca\xc7\xd2\xbb\xb8\xf6\xd3\xef\xd1\xd4\xa1\xa2\xc4\xda\xc8\xdd\xbf\xaa" - "\xb7\xc5\xb5\xc4\xcd\xf8\xc2\xe7\xb0\xd9\xbf\xc6\xc8\xab\xca\xe9\xbc\xc6" - "\xbb\xae", - "\u662f\u4e00\u4e2a\u8bed\u8a00\u3001\u5185\u5bb9\u5f00\u653e\u7684\u7f51" - "\u7edc\u767e\u79d1\u5168\u4e66\u8ba1\u5212") == true); - VERIFY(detect_and_convert_to_utf8( - "\xa1\xb6\xd2\xbb\xc1\xa3\xd6\xd3\xd5\xe6\xc8\xcb\xcb\xd5\xa1\xb7\xa3\xa8" - "\xd3\xa2\xce\xc4\xc3\xfb\xa3\xbaSo Real Time Cooking\xa3\xa9", - "\u300a\u4e00\u7c92\u949f\u771f\u4eba\u82cf\u300b\uff08\u82f1\u6587\u540d" - "\uff1aSo Real Time Cooking\uff09") == true); - - // Too short; detector thinks it's most likely Shift-JIS. - // VERIFY(detect_and_convert_to_utf8( - // "\xce\xe2\xb8\xe7\xbf\xdf", - // "\u5434\u54e5\u7a9f") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_big5() { - VERIFY(detect_and_convert_to_utf8( - "1\xa1]\xa4@\xa1^\xacO0\xbbP2\xa4\xa7\xb6\xa1\xaa\xba\xa6\xdb\xb5M\xbc\xc6" - "\xa1" "A\xacO\xb3\xcc\xa4p\xaa\xba\xa5\xbf\xa9_\xbc\xc6\xa1" "C", - "1\uff08\u4e00\uff09\u662f0\u82072\u4e4b\u9593\u7684\u81ea\u7136\u6578" - "\uff0c\u662f\u6700\u5c0f\u7684\u6b63\u5947\u6578\u3002") == true); - VERIFY(detect_and_convert_to_utf8( - "\xbeG\xa5\xf2\xaf\xf4\xa1]\xad^\xa4\xe5\xa6W \xa1G Cheng Chung Yin\xa1^" - "\xa1" "A\xacO\xa4@\xa6W\xa5x\xc6W\xa4k\xbat\xad\xfb", - "\u912d\u4ef2\u8335\uff08\u82f1\u6587\u540d \uff1a Cheng Chung Yin\uff09" - "\uff0c\u662f\u4e00\u540d\u53f0\u7063\u5973\u6f14\u54e1") == true); - - // Too short; detector thinks it's most likely Shift-JIS. - // VERIFY(detect_and_convert_to_utf8( - // "\xa7" "d\xad\xf4\xb8]", - // "\u5433\u54e5\u7a9f") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_koi8r() { - VERIFY(detect_and_convert_to_utf8( - "\xeb\xd7\xc5\xc2\xc5\xcb \xd0\xc5\xd2\xd7\xc1\xd1 \xd0\xcf \xd0\xcc\xcf" - "\xdd\xc1\xc4\xc9 \xc9 \xd7\xd4\xcf\xd2\xc1\xd1 \xd0\xcf \xce\xc1\xd3\xc5" - "\xcc\xc5\xce\xc9\xc0 \xd0\xd2\xcf\xd7\xc9\xce\xc3\xc9\xd1 \xeb\xc1\xce" - "\xc1\xc4\xd9.", - "\u041a\u0432\u0435\u0431\u0435\u043a \u043f\u0435\u0440\u0432\u0430" - "\u044f \u043f\u043e \u043f\u043b\u043e\u0449\u0430\u0434\u0438 \u0438 " - "\u0432\u0442\u043e\u0440\u0430\u044f \u043f\u043e \u043d\u0430\u0441" - "\u0435\u043b\u0435\u043d\u0438\u044e \u043f\u0440\u043e\u0432\u0438" - "\u043d\u0446\u0438\u044f \u041a\u0430\u043d\u0430\u0434\u044b.") == true); - VERIFY(detect_and_convert_to_utf8( - "\xe2\xdf\xd2\xc4\xc5\xce\xc9 \xc5 \xd3\xc5\xcc\xcf \xd7 \xf3\xc5\xd7\xc5" - "\xd2\xce\xc1 \xe2\xdf\xcc\xc7\xc1\xd2\xc9\xd1", - "\u0411\u044a\u0440\u0434\u0435\u043d\u0438 \u0435 \u0441\u0435\u043b" - "\u043e \u0432 \u0421\u0435\u0432\u0435\u0440\u043d\u0430 \u0411\u044a" - "\u043b\u0433\u0430\u0440\u0438\u044f") == true); - VERIFY(detect_and_convert_to_utf8( - "\xfe\xc5\xd2\xd7\xc5\xce\xc1 \xd0\xd2\xc5\xd7\xdf\xda\xc8\xcf\xc4\xce" - "\xc1", - "\u0427\u0435\u0440\u0432\u0435\u043d\u0430 \u043f\u0440\u0435\u0432" - "\u044a\u0437\u0445\u043e\u0434\u043d\u0430") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_windows_1251() { - VERIFY(detect_and_convert_to_utf8( - "\xce\xf7\xe5 \xed\xe0\xf8, \xea\xee\xbc \xf1\xe8 \xed\xe0 \xed\xe5\xe1" - "\xe5\xf1\xe0\xf2\xe0", - "\u041e\u0447\u0435 \u043d\u0430\u0448, \u043a\u043e\u0458 \u0441\u0438 " - "\u043d\u0430 \u043d\u0435\u0431\u0435\u0441\u0430\u0442\u0430") == true); - VERIFY(detect_and_convert_to_utf8( - "\xd1 \xe7\xee\xee\xeb\xee\xe3\xe8\xf7\xe5\xf1\xea\xee\xe9 \xf2\xee\xf7\xea" - "\xe8 \xe7\xf0\xe5\xed\xe8\xff, \xe4\xee\xec\xe0\xf8\xed\xff\xff \xea\xee" - "\xf8\xea\xe0 \x97 \xec\xeb\xe5\xea\xee\xef\xe8\xf2\xe0\xfe\xf9\xe5\xe5" - " \xf1\xe5\xec\xe5\xe9\xf1\xf2\xe2\xe0 \xea\xee\xf8\xe0\xf7\xfc\xe8\xf5 " - "\xee\xf2\xf0\xff\xe4\xe0 \xf5\xe8\xf9\xed\xfb\xf5.", - "\u0421 \u0437\u043e\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a" - "\u043e\u0439 \u0442\u043e\u0447\u043a\u0438 \u0437\u0440\u0435\u043d\u0438" - "\u044f, \u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043a\u043e" - "\u0448\u043a\u0430 \u2014 \u043c\u043b\u0435\u043a\u043e\u043f\u0438\u0442" - "\u0430\u044e\u0449\u0435\u0435 \u0441\u0435\u043c\u0435\u0439\u0441\u0442" - "\u0432\u0430 \u043a\u043e\u0448\u0430\u0447\u044c\u0438\u0445 \u043e\u0442" - "\u0440\u044f\u0434\u0430 \u0445\u0438\u0449\u043d\u044b\u0445.") == true); - return Count(true); -} - -bool TestExtIcu_ucsdet::test_utf8() { - VERIFY(detect_and_convert_to_utf8( - "\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0", - "\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0") - == true); - VERIFY(detect_and_convert_to_utf8( - "\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01", - "\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01") == true); - - return Count(true); -} - -bool TestExtIcu_ucsdet::test_utf16() { - // The detector only handles UTF-16 if there's a BOM at the front. - char utf16[] = - "\xff\xfeH\x00" "e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00" - "d\x00!\x00"; - - // Take off 1 byte for the NUL at the end of the char[]. - String utf16Str(utf16, sizeof utf16 - 1, AttachLiteral); - VERIFY(detect_and_convert_to_utf8( - utf16Str, - "\ufeffHello, world!") == true); - - return Count(true); -} diff --git a/hphp/test/ext/test_ext_icu_ucsdet.h b/hphp/test/ext/test_ext_icu_ucsdet.h deleted file mode 100644 index 64c22b592..000000000 --- a/hphp/test/ext/test_ext_icu_ucsdet.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#ifndef incl_HPHP_TEST_EXT_ICU_UCSDET_H_ -#define incl_HPHP_TEST_EXT_ICU_UCSDET_H_ - -// >>>>>> Generated by idl.php. Do NOT modify. <<<<<< - -#include "hphp/test/ext/test_cpp_ext.h" - -/////////////////////////////////////////////////////////////////////////////// - -class TestExtIcu_ucsdet : public TestCppExt { - public: - virtual bool RunTests(const std::string &which); - - private: - bool detect_and_convert_to_utf8(CStrRef bytes, CStrRef utf8); - - bool test_basics(); - - bool test_uninitialized(); - bool test_empty(); - bool test_cannot_detect(); - bool test_declared_encoding(); - - // English and Western European - bool test_hello_world(); - bool test_windows_1252(); - - // Eastern European - bool test_windows_1250(); - - // Arabic - bool test_windows_1256(); - - // Japanese - bool test_shift_jis(); - bool test_euc_jp(); - bool test_iso_2022_jp(); - - // Chinese - bool test_gb2312(); - bool test_big5(); - - // Cyrillic - bool test_koi8r(); - bool test_windows_1251(); - - // Universal - bool test_utf8(); - bool test_utf16(); -}; - -/////////////////////////////////////////////////////////////////////////////// - -#endif // incl_HPHP_TEST_EXT_ICU_UCSDET_H_ diff --git a/hphp/test/ext/test_ext_icu_uspoof.cpp b/hphp/test/ext/test_ext_icu_uspoof.cpp deleted file mode 100644 index 0fb2586d2..000000000 --- a/hphp/test/ext/test_ext_icu_uspoof.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#include "hphp/test/ext/test_ext_icu_uspoof.h" -#include "hphp/runtime/ext/ext_icu_uspoof.h" - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu_uspoof::RunTests(const std::string &which) { - bool ret = true; - - RUN_TEST(test_SpoofChecker_issuspicious); - RUN_TEST(test_SpoofChecker_areconfusable); - RUN_TEST(test_SpoofChecker_issuesfound); - RUN_TEST(test_SpoofChecker_setchecks); - RUN_TEST(test_SpoofChecker_setallowedlocales); - - return ret; -} - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu_uspoof::test_SpoofChecker_issuspicious() { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - VS(checker->t_issuspicious("facebook"), false); - - // facebook with Cyrillic spoof characters - VS(checker->t_issuspicious("f\u0430\u0441\u0435b\u043e\u043ek"), true); - - // "Russia" in Cyrillic with Latin spoof characters - VS(checker->t_issuspicious("Pocc\u0438\u044f"), true); - - // paypal with Cyrillic spoof characters - VS(checker->t_issuspicious("http://www.payp\u0430l.com"), true); - - // certain all-uppercase Latin sequences can be spoof of Greek - VS(checker->t_issuspicious("NAPKIN PEZ"), true); - VS(checker->t_issuspicious("napkin pez"), false); - - // English with Japanese characters - VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), false); - - // Japanese name with mixed kanji and hiragana - VS(checker->t_issuspicious("\u6a4b\u672c\u611b\u307f"), false); - - try { - checker->t_issuspicious("this is not UTF-8: \x87\xFB\xCA\x94\xDB"); - } catch (Exception& e) { - return Count(true); - } - - return Count(false); -} - -bool TestExtIcu_uspoof::test_SpoofChecker_areconfusable() { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - VS(checker->t_areconfusable("hello, world", "goodbye, world"), false); - VS(checker->t_areconfusable("hello, world", "hello, world"), true); - VS(checker->t_areconfusable("hello, world", "he11o, wor1d"), true); - VS(checker->t_areconfusable("hell\u00f8", "hello\u0337"), true); - - VS(checker->t_areconfusable("facebook", "f\u0430\u0441\u0435b\u043e\u043ek"), - true); - - VS(checker->t_areconfusable("facebook", "\U0001d41faceboo\u1d0b"), true); - - VS(checker->t_areconfusable("facebook", "\u017facebook"), true); - - VS(checker->t_areconfusable("paypal", "payp\u0430l"), true); - VS(checker->t_areconfusable( - "NAPKIN PEZ", - "\u039d\u0391\u03a1\u039a\u0399\u039d \u03a1\u0395\u0396"), - true); - - VS(checker->t_areconfusable( - "facebook", - "ufiek-a\u048ba\u049d \u049da\u048b\u00f0a\u048b\u01e5a\u048b-\u049dota-" - "\u00f0o\u00f0ol"), - false); - - try { - checker->t_areconfusable( - "this is not UTF-8: \x87\xFB\xCA\x94\xDB", - "so there."); - } catch (Exception& e) { - return Count(true); - } - - return Count(false); -} - -bool TestExtIcu_uspoof::test_SpoofChecker_issuesfound() { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - Variant ret; - - VS(checker->t_issuspicious("NAPKIN PEZ", ref(ret)), true); - VS(ret.getInt64(), q_SpoofChecker$$WHOLE_SCRIPT_CONFUSABLE); - - VS(checker->t_issuspicious("f\u0430\u0441\u0435b\u043e\u043ek", ref(ret)), - true); - VS(ret.getInt64(), q_SpoofChecker$$MIXED_SCRIPT_CONFUSABLE); - - VS(checker->t_areconfusable("hello, world", "he11o, wor1d", ref(ret)), true); - VS(ret.getInt64(), q_SpoofChecker$$SINGLE_SCRIPT_CONFUSABLE); - - return Count(true); -} - -bool TestExtIcu_uspoof::test_SpoofChecker_setchecks() { - { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - - // The checker should start in any-case mode. - VS(checker->t_areconfusable("HELLO", "H\u0415LLO"), true); - VS(checker->t_areconfusable("hello", "h\u0435llo"), true); - - // Go to lower-case only mode (assumes all strings have been - // case-folded). - checker->t_setchecks( - q_SpoofChecker$$MIXED_SCRIPT_CONFUSABLE | - q_SpoofChecker$$WHOLE_SCRIPT_CONFUSABLE | - q_SpoofChecker$$SINGLE_SCRIPT_CONFUSABLE - ); - VS(checker->t_areconfusable("HELLO", "H\u0415LLO"), false); - VS(checker->t_areconfusable("hello", "h\u0435llo"), true); - } - - { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), false); - - // Only allow characters of a single script. - checker->t_setchecks(q_SpoofChecker$$SINGLE_SCRIPT); - VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), true); - } - - try { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - checker->t_setchecks(0xDEADBEEF); - } catch (Exception& e) { - return Count(true); - } - - return Count(false); -} - -bool TestExtIcu_uspoof::test_SpoofChecker_setallowedlocales() { - p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)()); - - const char* common = "Rogers"; - const char* japanese_kanji_hiragana = "\u6a4b\u672c\u611b\u307f"; - const char* korean = "\ud55c\uad6d\ub9d0"; - const char* arabic = "\u0645\u0631\u062d\u0628\u064b\u0627"; - const char* russian_cyrillic = - "\u0417\u0438\u0301\u043c\u043d\u0438\u0439 " - "\u0432\u0435\u0301\u0447\u0435\u0440"; - const char* snowman = "\u2603"; - - checker->t_setallowedlocales("en_US"); - VS(checker->t_issuspicious(common), false); - VS(checker->t_issuspicious(japanese_kanji_hiragana), true); - VS(checker->t_issuspicious(russian_cyrillic), true); - VS(checker->t_issuspicious(arabic), true); - VS(checker->t_issuspicious(korean), true); - VS(checker->t_issuspicious(snowman), false); - - checker->t_setallowedlocales("en_US, ja_JP"); - VS(checker->t_issuspicious(common), false); - VS(checker->t_issuspicious(japanese_kanji_hiragana), false); - VS(checker->t_issuspicious(russian_cyrillic), true); - VS(checker->t_issuspicious(arabic), true); - VS(checker->t_issuspicious(korean), true); - VS(checker->t_issuspicious(snowman), false); - - checker->t_setallowedlocales("en_US, ko_KR"); - VS(checker->t_issuspicious(common), false); - VS(checker->t_issuspicious(japanese_kanji_hiragana), true); - VS(checker->t_issuspicious(russian_cyrillic), true); - VS(checker->t_issuspicious(arabic), true); - VS(checker->t_issuspicious(korean), false); - VS(checker->t_issuspicious(snowman), false); - - checker->t_setallowedlocales("en_US, ar_AR"); - VS(checker->t_issuspicious(common), false); - VS(checker->t_issuspicious(japanese_kanji_hiragana), true); - VS(checker->t_issuspicious(russian_cyrillic), true); - VS(checker->t_issuspicious(arabic), false); - VS(checker->t_issuspicious(korean), true); - VS(checker->t_issuspicious(snowman), false); - - checker->t_setallowedlocales("en_US, ru_RU"); - VS(checker->t_issuspicious(common), false); - VS(checker->t_issuspicious(japanese_kanji_hiragana), true); - VS(checker->t_issuspicious(russian_cyrillic), false); - VS(checker->t_issuspicious(arabic), true); - VS(checker->t_issuspicious(korean), true); - VS(checker->t_issuspicious(snowman), false); - - return Count(true); -} diff --git a/hphp/test/ext/test_ext_icu_uspoof.h b/hphp/test/ext/test_ext_icu_uspoof.h deleted file mode 100644 index 3b2a61d6b..000000000 --- a/hphp/test/ext/test_ext_icu_uspoof.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#ifndef TEST_EXT_ICU_USPOOF_H -#define TEST_EXT_ICU_USPOOF_H - -// >>>>>> Generated by idl.php. Do NOT modify. <<<<<< - -#include "hphp/test/ext/test_cpp_ext.h" - -/////////////////////////////////////////////////////////////////////////////// - -class TestExtIcu_uspoof : public TestCppExt { - public: - virtual bool RunTests(const std::string &which); - - bool test_SpoofChecker_issuspicious(); - bool test_SpoofChecker_areconfusable(); - bool test_SpoofChecker_issuesfound(); - bool test_SpoofChecker_setchecks(); - bool test_SpoofChecker_setallowedlocales(); -}; - -/////////////////////////////////////////////////////////////////////////////// - -#endif // TEST_EXT_ICU_USPOOF_H diff --git a/hphp/test/slow/ext_icu/ucsdet.php b/hphp/test/slow/ext_icu/ucsdet.php new file mode 100644 index 000000000..c7daa278b --- /dev/null +++ b/hphp/test/slow/ext_icu/ucsdet.php @@ -0,0 +1,322 @@ +settext($bytes); + $match = $detector->detect(); + if (!$match->isvalid()) { + return false; + } + // echo "Got: " . $match->getutf8() . "\n"; + // echo "Want: " . $utf8 . "\n"; + return $match->getutf8() == $utf8; +} + +function test_basics() { + // This is as unmistakably UTF-8 as it gets. + $utf8_snowman_with_bom = getunicode("\\uFEFF\\u2603"); + + $detector = new EncodingDetector(); + + $detector->settext($utf8_snowman_with_bom); + $match = $detector->detect(); + VERIFY($match->isvalid() == true); + VERIFY($match->getencoding() == "UTF-8"); + VERIFY($match->getconfidence() == 100); + VERIFY($match->getutf8() == $utf8_snowman_with_bom); +} + +function test_cannot_detect() { + $detector = new EncodingDetector(); + + // The detector has no idea what to do with this. + $detector->settext("\xc7\xe8\xec\xed\xe8\xe9 \xe2\xe5\xf7\xe5\xf0"); + $match = $detector->detect(); + VERIFY($match->isvalid() == false); +} + +function test_declared_encoding() { + // Right now (ICU 4.6), this API doesn't actually do anything, but + // let's at least verify it doesn't crash. + $detector = new EncodingDetector(); + $detector->settext("Yo!"); + $detector->setdeclaredencoding("windows-1251"); + + $match = $detector->detect(); + VERIFY($match->isvalid() == true); + VERIFY($match->getutf8() == "Yo!"); +} + +function test_hello_world() { + VERIFY(detect_and_convert_to_utf8("Hello, world!", "Hello, world!") == true); +} + +function test_windows_1252() { + VERIFY(detect_and_convert_to_utf8( + "Toda Europa ley\xf3 Don Quijote como una s\xe1tira.", + getunicode('Toda Europa ley\u00f3 Don Quijote como una s\u00e1tira.')) + == true); + + VERIFY(detect_and_convert_to_utf8( + "Notre P\xe8re, qui \xeates aux cieux", + getunicode('Notre P\u00e8re, qui \u00eates aux cieux')) == true); + + VERIFY(detect_and_convert_to_utf8( + "Marta da Silva, als beste Spielerin und beste Torsch\xFCtzin der WM ". + "2007 sowie bisher f\xFCnf Mal als \x84Weltfu\xDF". "ballerin des ". + "Jahres\x93 ausgezeichnet, kommt zur Welt.", + getunicode( + 'Marta da Silva, als beste Spielerin und beste Torsch\u00fctzin der WM '. + '2007 sowie bisher f\u00FCnf Mal als \u201EWeltfu\u00DFballerin des '. + 'Jahres\u201C ausgezeichnet, kommt zur Welt.')) == true); +} + +function test_windows_1250() { + VERIFY(detect_and_convert_to_utf8( + "Do Wikipedie m\xf9\x9e" ."e p\xf8isp\xedvat kdokoliv.", + getunicode('Do Wikipedie m\u016f\u017ee p\u0159isp\u00edvat kdokoliv.')) + == true); + + VERIFY(detect_and_convert_to_utf8( + "O\xe8". "e na\x9a, koji jesi na nebesima, sveti se ime Tvoje.", + getunicode('O\u010de na\u0161, koji jesi na nebesima, sveti se ime Tvoje.')) + == true); + + VERIFY(detect_and_convert_to_utf8( + "Prezentacj\xea pierwszego graficznego \x9crodowiska pracy z rodziny ". + "Windows firmy Microsoft przeprowadzono w listopadzie 1985.", + getunicode('Prezentacj\u0119 pierwszego graficznego \u015brodowiska pracy z rodziny '. + 'Windows firmy Microsoft przeprowadzono w listopadzie 1985.')) == true); +} + +function test_windows_1256() { + VERIFY(detect_and_convert_to_utf8( + "\xe1\xc7 \xc3\xca\xdf\xe1\xe3 \xc7\xe1\xda\xd1\xc8\xed\xc9", + getunicode( + '\u0644\u0627 \u0623\u062a\u0643\u0644\u0645 \u0627\u0644\u0639\u0631\u0628'. + '\u064a\u0629')) == true); + + VERIFY(detect_and_convert_to_utf8( + "\xe6\xed\xdf\xed\xc8\xed\xcf\xed\xc7 \xe5\xed \xe3\xd4\xd1\xe6\xda \xe3". + "\xe6\xd3\xe6\xda\xc9 \xe3\xca\xda\xcf\xcf\xc9 \xc7\xe1\xe1\xdb\xc7\xca". + "\xa1 \xe3\xc8\xe4\xed\xc9 \xda\xe1\xec \xc7\xe1\xe6\xed\xc8\xa1 \xd0\xc7". + "\xca \xe3\xcd\xca\xe6\xec \xcd\xd1\xa1 \xca\xd4\xdb\xe1\xe5\xc7 \xe3\xc4". + "\xd3\xd3\xc9 \xe6\xed\xdf\xed\xe3\xed\xcf\xed\xc7\xa1 \xc7\xe1\xca". + "\xed \xe5\xed \xe3\xe4\xd9\xe3\xc9 \xdb\xed\xd1 \xd1\xc8\xcd\xed\xc9.", + getunicode( + '\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u0647\u064a \u0645'. + '\u0634\u0631\u0648\u0639 \u0645\u0648\u0633\u0648\u0639\u0629 \u0645\u062a'. + '\u0639\u062f\u062f\u0629 \u0627\u0644\u0644\u063a\u0627\u062a\u060c \u0645'. + '\u0628\u0646\u064a\u0629 \u0639\u0644\u0649 \u0627\u0644\u0648\u064a\u0628'. + '\u060c \u0630\u0627\u062a \u0645\u062d\u062a\u0648\u0649 \u062d\u0631'. + '\u060c \u062a\u0634\u063a\u0644\u0647\u0627 \u0645\u0624\u0633\u0633'. + '\u0629 \u0648\u064a\u0643\u064a\u0645\u064a\u062f\u064a\u0627\u060c \u0627'. + '\u0644\u062a\u064a \u0647\u064a \u0645\u0646\u0638\u0645\u0629 \u063a'. + '\u064a\u0631 \u0631\u0628\u062d\u064a\u0629.')) == true); +} + +function test_shift_jis() { + VERIFY(detect_and_convert_to_utf8( + "\x81w\x82\xc6\x82\xc8\x82\xe8\x82\xcc\x83g\x83g\x83\x8d\x81x\x82\xcd\x81". + "A\x83X\x83^\x83W\x83I\x83W\x83u\x83\x8a\x90\xa7\x8d\xec\x82\xcc\x93\xfa". + "\x96{\x82\xcc\x92\xb7\x95\xd2\x83". "A\x83j\x83\x81\x81[\x83V\x83\x87\x83". + "\x93\x8d\xec\x95i\x81". "B", + getunicode( + '\u300e\u3068\u306a\u308a\u306e\u30c8\u30c8\u30ed\u300f\u306f\u3001\u30b9'. + '\u30bf\u30b8\u30aa\u30b8\u30d6\u30ea\u5236\u4f5c\u306e\u65e5\u672c\u306e'. + '\u9577\u7de8\u30a2\u30cb\u30e1\u30fc\u30b7\u30e7\u30f3\u4f5c\u54c1\u3002')) + == true); + + VERIFY(detect_and_convert_to_utf8( + "\x83". "E\x83". "B\x83L\x83y\x83". "f\x83". "B\x83". "A (Wikipedia) \x82\xcd". + "\x83". "E\x83". "B\x83L\x83\x81\x83". "f\x83". "B\x83". "A\x8d\xe0\x92". + "c\x82\xaa\x89^\x89". "c\x82\xb7\x82\xe9\x83I\x83\x93\x83\x89\x83". "C\x83". + "\x93\x95S\x89\xc8\x8e\x96\x93T\x81". "B", + getunicode( + '\u30a6\u30a3\u30ad\u30da\u30c7\u30a3\u30a2 (Wikipedia) \u306f\u30a6'. + '\u30a3\u30ad\u30e1\u30c7\u30a3\u30a2\u8ca1\u56e3\u304c\u904b\u55b6\u3059'. + '\u308b\u30aa\u30f3\u30e9\u30a4\u30f3\u767e\u79d1\u4e8b\u5178\u3002')) + == true); + + // Too short; detector thinks it's most likely Windows-1252. + // + // VERIFY(detect_and_convert_to_utf8( + // "\x90\xa2\x8a" "E\x90l\x8c\xa0\x90\xe9\x8c\xbe", + // "\u4e16\u754c\u4eba\u6a29\u5ba3\u8a00") == true); +} + +function test_euc_jp() { + VERIFY(detect_and_convert_to_utf8( + "\xb2\xbf\xbf\xcd\xa4\xe2\xa1\xa2\xa4\xdb\xa4\xb7\xa4\xa4\xa4\xde\xa4\xde". + "\xa4\xcb\xc2\xe1\xca\xe1\xa1\xa2\xb9\xb4\xb6\xd8\xa1\xa2\xcb\xf4\xa4\xcf". + "\xc4\xc9\xca\xfc\xa4\xb5\xa4\xec\xa4\xeb\xa4\xb3\xa4\xc8\xa4\xcf\xa4\xca". + "\xa4\xa4\xa1\xa3", + getunicode( + '\u4f55\u4eba\u3082\u3001\u307b\u3057\u3044\u307e\u307e\u306b\u902e\u6355'. + '\u3001\u62d8\u7981\u3001\u53c8\u306f\u8ffd\u653e\u3055\u308c\u308b\u3053'. + '\u3068\u306f\u306a\u3044\u3002')) == true); + VERIFY(detect_and_convert_to_utf8( + "1920\xc7\xaf \xa5\xa6\xa5\xa3\xa5\xf3\xa5\xd6\xa5\xeb\xa5\xc9\xa5\xf3\xc1". + "\xaa\xbc\xea\xb8\xa2\xa1\xcaThe Championships, Wimbledon 1920\xa1\xcb\xa4". + "\xcb\xb4\xd8\xa4\xb9\xa4\xeb\xb5\xad\xbb\xf6\xa1\xa3", + getunicode( + '1920\u5e74 \u30a6\u30a3\u30f3\u30d6\u30eb\u30c9\u30f3\u9078\u624b\u6a29'. + '\uff08The Championships, Wimbledon 1920\uff09\u306b\u95a2\u3059\u308b'. + '\u8a18\u4e8b\u3002')) == true); +} + +function test_iso_2022_jp() { + VERIFY(detect_and_convert_to_utf8( + "\x1b\$B%-%M%F%#%C%/%3%M%/%7%g%s!J\x1b(Bkinetic connection\x1b\$B!K\$O\x1b(B1". + "986\x1b\$BG/\$K%=%K!<\$,\x1b(BMSX2\x1b\$B\$GH/Gd\$7\$?%Q%:%k%2!<%`!#\x1b(B", + getunicode( + '\u30ad\u30cd\u30c6\u30a3\u30c3\u30af\u30b3\u30cd\u30af\u30b7\u30e7\u30f3'. + '\uff08kinetic connection\uff09\u306f1986\u5e74\u306b\u30bd\u30cb\u30fc'. + '\u304cMSX2\u3067\u767a\u58f2\u3057\u305f\u30d1\u30ba\u30eb\u30b2\u30fc'. + '\u30e0\u3002')) == true); + VERIFY(detect_and_convert_to_utf8( + "\x1b\$B5f6K\x1b(B!!\x1b\$BJQBV2>LL\x1b(B", + getunicode("\u7a76\u6975!!\u5909\u614b\u4eee\u9762")) == true); +} + +function test_gb2312() { + VERIFY(detect_and_convert_to_utf8( + "\xca\xc7\xd2\xbb\xb8\xf6\xd3\xef\xd1\xd4\xa1\xa2\xc4\xda\xc8\xdd\xbf\xaa". + "\xb7\xc5\xb5\xc4\xcd\xf8\xc2\xe7\xb0\xd9\xbf\xc6\xc8\xab\xca\xe9\xbc\xc6". + "\xbb\xae", + getunicode( + '\u662f\u4e00\u4e2a\u8bed\u8a00\u3001\u5185\u5bb9\u5f00\u653e\u7684\u7f51'. + '\u7edc\u767e\u79d1\u5168\u4e66\u8ba1\u5212')) == true); + VERIFY(detect_and_convert_to_utf8( + "\xa1\xb6\xd2\xbb\xc1\xa3\xd6\xd3\xd5\xe6\xc8\xcb\xcb\xd5\xa1\xb7\xa3\xa8". + "\xd3\xa2\xce\xc4\xc3\xfb\xa3\xbaSo Real Time Cooking\xa3\xa9", + getunicode( + '\u300a\u4e00\u7c92\u949f\u771f\u4eba\u82cf\u300b\uff08\u82f1\u6587\u540d'. + '\uff1aSo Real Time Cooking\uff09')) == true); + + // Too short; detector thinks it's most likely Shift-JIS. + // VERIFY(detect_and_convert_to_utf8( + // "\xce\xe2\xb8\xe7\xbf\xdf", + // "\u5434\u54e5\u7a9f") == true); +} + +function test_big5() { + VERIFY(detect_and_convert_to_utf8( + "1\xa1]\xa4@\xa1^\xacO0\xbbP2\xa4\xa7\xb6\xa1\xaa\xba\xa6\xdb\xb5M\xbc\xc6". + "\xa1". "A\xacO\xb3\xcc\xa4p\xaa\xba\xa5\xbf\xa9_\xbc\xc6\xa1". "C", + getunicode( + '1\uff08\u4e00\uff09\u662f0\u82072\u4e4b\u9593\u7684\u81ea\u7136\u6578'. + '\uff0c\u662f\u6700\u5c0f\u7684\u6b63\u5947\u6578\u3002')) == true); + VERIFY(detect_and_convert_to_utf8( + "\xbeG\xa5\xf2\xaf\xf4\xa1]\xad^\xa4\xe5\xa6W \xa1G Cheng Chung Yin\xa1^". + "\xa1". "A\xacO\xa4@\xa6W\xa5x\xc6W\xa4k\xbat\xad\xfb", + getunicode( + '\u912d\u4ef2\u8335\uff08\u82f1\u6587\u540d \uff1a Cheng Chung Yin\uff09'. + '\uff0c\u662f\u4e00\u540d\u53f0\u7063\u5973\u6f14\u54e1')) == true); + + // Too short; detector thinks it's most likely Shift-JIS. + // VERIFY(detect_and_convert_to_utf8( + // "\xa7" "d\xad\xf4\xb8]", + // "\u5433\u54e5\u7a9f") == true); +} + +function test_koi8r() { + VERIFY(detect_and_convert_to_utf8( + "\xeb\xd7\xc5\xc2\xc5\xcb \xd0\xc5\xd2\xd7\xc1\xd1 \xd0\xcf \xd0\xcc\xcf". + "\xdd\xc1\xc4\xc9 \xc9 \xd7\xd4\xcf\xd2\xc1\xd1 \xd0\xcf \xce\xc1\xd3\xc5". + "\xcc\xc5\xce\xc9\xc0 \xd0\xd2\xcf\xd7\xc9\xce\xc3\xc9\xd1 \xeb\xc1\xce". + "\xc1\xc4\xd9.", + getunicode( + '\u041a\u0432\u0435\u0431\u0435\u043a \u043f\u0435\u0440\u0432\u0430'. + '\u044f \u043f\u043e \u043f\u043b\u043e\u0449\u0430\u0434\u0438 \u0438 '. + '\u0432\u0442\u043e\u0440\u0430\u044f \u043f\u043e \u043d\u0430\u0441'. + '\u0435\u043b\u0435\u043d\u0438\u044e \u043f\u0440\u043e\u0432\u0438'. + '\u043d\u0446\u0438\u044f \u041a\u0430\u043d\u0430\u0434\u044b.')) == true); + VERIFY(detect_and_convert_to_utf8( + "\xe2\xdf\xd2\xc4\xc5\xce\xc9 \xc5 \xd3\xc5\xcc\xcf \xd7 \xf3\xc5\xd7\xc5". + "\xd2\xce\xc1 \xe2\xdf\xcc\xc7\xc1\xd2\xc9\xd1", + getunicode( + '\u0411\u044a\u0440\u0434\u0435\u043d\u0438 \u0435 \u0441\u0435\u043b'. + '\u043e \u0432 \u0421\u0435\u0432\u0435\u0440\u043d\u0430 \u0411\u044a'. + '\u043b\u0433\u0430\u0440\u0438\u044f')) == true); + VERIFY(detect_and_convert_to_utf8( + "\xfe\xc5\xd2\xd7\xc5\xce\xc1 \xd0\xd2\xc5\xd7\xdf\xda\xc8\xcf\xc4\xce". + "\xc1", + getunicode( + '\u0427\u0435\u0440\u0432\u0435\u043d\u0430 \u043f\u0440\u0435\u0432'. + '\u044a\u0437\u0445\u043e\u0434\u043d\u0430')) == true); +} + +function test_windows_1251() { + VERIFY(detect_and_convert_to_utf8( + "\xce\xf7\xe5 \xed\xe0\xf8, \xea\xee\xbc \xf1\xe8 \xed\xe0 \xed\xe5\xe1". + "\xe5\xf1\xe0\xf2\xe0", + getunicode( + '\u041e\u0447\u0435 \u043d\u0430\u0448, \u043a\u043e\u0458 \u0441\u0438 '. + '\u043d\u0430 \u043d\u0435\u0431\u0435\u0441\u0430\u0442\u0430')) == true); + VERIFY(detect_and_convert_to_utf8( + "\xd1 \xe7\xee\xee\xeb\xee\xe3\xe8\xf7\xe5\xf1\xea\xee\xe9 \xf2\xee\xf7\xea". + "\xe8 \xe7\xf0\xe5\xed\xe8\xff, \xe4\xee\xec\xe0\xf8\xed\xff\xff \xea\xee". + "\xf8\xea\xe0 \x97 \xec\xeb\xe5\xea\xee\xef\xe8\xf2\xe0\xfe\xf9\xe5\xe5". + " \xf1\xe5\xec\xe5\xe9\xf1\xf2\xe2\xe0 \xea\xee\xf8\xe0\xf7\xfc\xe8\xf5 ". + "\xee\xf2\xf0\xff\xe4\xe0 \xf5\xe8\xf9\xed\xfb\xf5.", + getunicode( + '\u0421 \u0437\u043e\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a'. + '\u043e\u0439 \u0442\u043e\u0447\u043a\u0438 \u0437\u0440\u0435\u043d\u0438'. + '\u044f, \u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043a\u043e'. + '\u0448\u043a\u0430 \u2014 \u043c\u043b\u0435\u043a\u043e\u043f\u0438\u0442'. + '\u0430\u044e\u0449\u0435\u0435 \u0441\u0435\u043c\u0435\u0439\u0441\u0442'. + '\u0432\u0430 \u043a\u043e\u0448\u0430\u0447\u044c\u0438\u0445 \u043e\u0442'. + '\u0440\u044f\u0434\u0430 \u0445\u0438\u0449\u043d\u044b\u0445.')) == true); +} + +function test_utf8() { + VERIFY(detect_and_convert_to_utf8( + getunicode( + "\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0"), + getunicode( + '\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0')) + == true); + VERIFY(detect_and_convert_to_utf8( + getunicode('\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01'), + getunicode('\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01')) == true); +} + +function test_utf16() { + // The detector only handles UTF-16 if there's a BOM at the front. + $utf16 = + "\xff\xfeH\x00". "e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00". + "d\x00!\x00"; + VERIFY(detect_and_convert_to_utf8( + $utf16, + getunicode("\ufeffHello, world!")) == true); +} + +test_basics(); +test_cannot_detect(); +test_declared_encoding(); +test_hello_world(); +test_windows_1252(); +test_windows_1250(); +test_windows_1256(); +test_shift_jis(); +test_euc_jp(); +test_iso_2022_jp(); +test_gb2312(); +test_big5(); +test_koi8r(); +test_windows_1251(); +test_utf8(); +test_utf16(); diff --git a/hphp/test/slow/ext_icu/ucsdet.php.expect b/hphp/test/slow/ext_icu/ucsdet.php.expect new file mode 100644 index 000000000..352ac635c --- /dev/null +++ b/hphp/test/slow/ext_icu/ucsdet.php.expect @@ -0,0 +1,34 @@ +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) diff --git a/hphp/test/slow/ext_icu/uspoof.php b/hphp/test/slow/ext_icu/uspoof.php new file mode 100644 index 000000000..abd6db9e8 --- /dev/null +++ b/hphp/test/slow/ext_icu/uspoof.php @@ -0,0 +1,187 @@ +issuspicious("facebook"), false); + + // facebook with Cyrillic spoof characters + VS($checker->issuspicious(u('f\u0430\u0441\u0435b\u043e\u043ek')), true); + + // "Russia" in Cyrillic with Latin spoof characters + VS($checker->issuspicious(u('Pocc\u0438\u044f')), true); + + // paypal with Cyrillic spoof characters + VS($checker->issuspicious(u('http://www.payp\u0430l.com')), true); + + // certain all-uppercase Latin sequences can be spoof of Greek + VS($checker->issuspicious('NAPKIN PEZ'), true); + VS($checker->issuspicious('napkin pez'), false); + + // English with Japanese characters + VS($checker->issuspicious(u('True fact: \u5fcd\u8005 are mammals')), false); + + // Japanese name with mixed kanji and hiragana + VS($checker->issuspicious(u('\u6a4b\u672c\u611b\u307f')), false); + + // try { + // $checker->issuspicious("this is not UTF-8: \x87\xFB\xCA\x94\xDB"); + // } catch (Exception $e) { + // VS(true, true); + // } +} + +function test_SpoofChecker_areconfusable() { + $checker = new SpoofChecker(); + VS($checker->areconfusable("hello, world", "goodbye, world"), false); + VS($checker->areconfusable("hello, world", "hello, world"), true); + VS($checker->areconfusable("hello, world", "he11o, wor1d"), true); + VS($checker->areconfusable(u('hell\u00f8'), u('hello\u0337')), true); + + VS($checker->areconfusable("facebook", + u('f\u0430\u0441\u0435b\u043e\u043ek')), + true); + + VS($checker->areconfusable("facebook", "\xf0\x9d\x90\x9faceboo".u('\u1d0b')), + true); + + VS($checker->areconfusable("facebook", u('\u017facebook')), true); + + VS($checker->areconfusable("paypal", u('payp\u0430l')), true); + VS($checker->areconfusable( + "NAPKIN PEZ", + u('\u039d\u0391\u03a1\u039a\u0399\u039d \u03a1\u0395\u0396')), + true); + + VS($checker->areconfusable( + "facebook", + u('ufiek-a\u048ba\u049d \u049da\u048b\u00f0a\u048b\u01e5a\u048b-\u049dota-'. + '\u00f0o\u00f0ol')), + false); + + // try { + // $checker->areconfusable( + // "this is not UTF-8: \x87\xFB\xCA\x94\xDB", + // "so there."); + // } catch (Exception $e) { + // VS(true, true); + // } +} + +function test_SpoofChecker_issuesfound() { + $checker = new SpoofChecker(); + + VS($checker->issuspicious("NAPKIN PEZ", $ret), true); + VS($ret, Spoofchecker::WHOLE_SCRIPT_CONFUSABLE); + + VS($checker->issuspicious(u('f\u0430\u0441\u0435b\u043e\u043ek'), $ret), + true); + VS($ret, SpoofChecker::MIXED_SCRIPT_CONFUSABLE); + + VS($checker->areconfusable("hello, world", "he11o, wor1d", $ret), true); + VS($ret, SpoofChecker::SINGLE_SCRIPT_CONFUSABLE); + + return Count(true); +} + +function test_SpoofChecker_setchecks() { + $checker = new SpoofChecker(); + + // The checker should start in any-case mode. + VS($checker->areconfusable("HELLO", u('H\u0415LLO')), true); + VS($checker->areconfusable("hello", u('h\u0435llo')), true); + + // Go to lower-case only mode (assumes all strings have been + // case-folded). + $checker->setchecks( + SpoofChecker::MIXED_SCRIPT_CONFUSABLE | + SpoofChecker::WHOLE_SCRIPT_CONFUSABLE | + SpoofChecker::SINGLE_SCRIPT_CONFUSABLE + ); + VS($checker->areconfusable("HELLO", u('H\u0415LLO')), false); + VS($checker->areconfusable("hello", u('h\u0435llo')), true); + + $checker = new SpoofChecker(); + VS($checker->issuspicious(u('True fact: \u5fcd\u8005 are mammals')), false); + + // Only allow characters of a single script. + $checker->setchecks(SpoofChecker::SINGLE_SCRIPT); + VS($checker->issuspicious(u('True fact: \u5fcd\u8005 are mammals')), true); + + // try { + // $checker = new SpoofChecker(); + // $checker->setchecks(0xDEADBEEF); + // } catch (Exception $e) { + // VS(true, true); + // } +} + +function test_SpoofChecker_setallowedlocales() { + $checker = new SpoofChecker(); + + $common = "Rogers"; + $japanese_kanji_hiragana = u('\u6a4b\u672c\u611b\u307f'); + $korean = u('\ud55c\uad6d\ub9d0'); + $arabic = u('\u0645\u0631\u062d\u0628\u064b\u0627'); + $russian_cyrillic = + u('\u0417\u0438\u0301\u043c\u043d\u0438\u0439 '. + '\u0432\u0435\u0301\u0447\u0435\u0440'); + $snowman = u('\u2603'); + + $checker->setallowedlocales("en_US"); + VS($checker->issuspicious($common), false); + VS($checker->issuspicious($japanese_kanji_hiragana), true); + VS($checker->issuspicious($russian_cyrillic), true); + VS($checker->issuspicious($arabic), true); + VS($checker->issuspicious($korean), true); + VS($checker->issuspicious($snowman), false); + + $checker->setallowedlocales("en_US, ja_JP"); + VS($checker->issuspicious($common), false); + VS($checker->issuspicious($japanese_kanji_hiragana), false); + VS($checker->issuspicious($russian_cyrillic), true); + VS($checker->issuspicious($arabic), true); + VS($checker->issuspicious($korean), true); + VS($checker->issuspicious($snowman), false); + + $checker->setallowedlocales("en_US, ko_KR"); + VS($checker->issuspicious($common), false); + VS($checker->issuspicious($japanese_kanji_hiragana), true); + VS($checker->issuspicious($russian_cyrillic), true); + VS($checker->issuspicious($arabic), true); + VS($checker->issuspicious($korean), false); + VS($checker->issuspicious($snowman), false); + + $checker->setallowedlocales("en_US, ar_AR"); + VS($checker->issuspicious($common), false); + VS($checker->issuspicious($japanese_kanji_hiragana), true); + VS($checker->issuspicious($russian_cyrillic), true); + VS($checker->issuspicious($arabic), false); + VS($checker->issuspicious($korean), true); + VS($checker->issuspicious($snowman), false); + + $checker->setallowedlocales("en_US, ru_RU"); + VS($checker->issuspicious($common), false); + VS($checker->issuspicious($japanese_kanji_hiragana), true); + VS($checker->issuspicious($russian_cyrillic), false); + VS($checker->issuspicious($arabic), true); + VS($checker->issuspicious($korean), true); + VS($checker->issuspicious($snowman), false); +} + +test_SpoofChecker_issuspicious(); +test_SpoofChecker_areconfusable(); +test_SpoofChecker_issuesfound(); +test_SpoofChecker_setchecks(); +test_SpoofChecker_setallowedlocales(); diff --git a/hphp/test/slow/ext_icu/uspoof.php.expect b/hphp/test/slow/ext_icu/uspoof.php.expect new file mode 100644 index 000000000..4e6ee598d --- /dev/null +++ b/hphp/test/slow/ext_icu/uspoof.php.expect @@ -0,0 +1,60 @@ +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true)