Get the rest of the ExtIcu-ish things to php

Esse commit está contido em:
Jordan DeLong
2013-06-15 14:36:32 -07:00
commit de Sara Golemon
commit 75be029d31
11 arquivos alterados com 603 adições e 778 exclusões
-3
Ver Arquivo
@@ -35,9 +35,6 @@
#include "hphp/facebook/extensions/tao/test_ext_tao.h"
#include "hphp/facebook/extensions/urlextraction/test_ext_urlextraction.h"
#include "hphp/test/ext/test_ext_curl.h"
#include "hphp/test/ext/test_ext_icu_ucnv.h"
#include "hphp/test/ext/test_ext_icu_ucsdet.h"
#include "hphp/test/ext/test_ext_icu_uspoof.h"
#include "hphp/test/ext/test_ext_imagesprite.h"
#include "hphp/test/ext/test_ext_intl.h"
#include "hphp/test/ext/test_ext_ipc.h"
-36
Ver Arquivo
@@ -1,36 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "hphp/test/ext/test_ext_icu_ucnv.h"
#include "hphp/runtime/ext/ext_icu_ucnv.h"
IMPLEMENT_SEP_EXTENSION_TEST(Icu_ucnv);
///////////////////////////////////////////////////////////////////////////////
// Entry point of the Icu_ucnv suite: starts with a passing result flag,
// invokes the suite's single test through RUN_TEST and returns the flag.
// NOTE(review): RUN_TEST is defined outside this file; presumably it uses
// `which` to filter tests and clears `ret` on failure -- confirm.
bool TestExtIcu_ucnv::RunTests(const std::string &which) {
bool ret = true;
RUN_TEST(test_UConverter);
return ret;
}
///////////////////////////////////////////////////////////////////////////////
bool TestExtIcu_ucnv::test_UConverter() {
  // Nothing is asserted natively: UConverter is handled in TestCodeRun, so
  // this only records a passing result.
  const bool passed = true;
  return Count(passed);
}
-35
Ver Arquivo
@@ -1,35 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#ifndef incl_HPHP_TEST_EXT_ICU_UCNV_H_
#define incl_HPHP_TEST_EXT_ICU_UCNV_H_
// >>>>>> Generated by idl.php. Do NOT modify. <<<<<<
#include "hphp/test/ext/test_cpp_ext.h"
///////////////////////////////////////////////////////////////////////////////
// Test fixture for the Icu_ucnv extension, built on TestCppExt.
class TestExtIcu_ucnv : public TestCppExt {
public:
// Suite entry point. NOTE(review): `which` presumably selects the test(s)
// to run -- confirm against TestCppExt / RUN_TEST.
virtual bool RunTests(const std::string &which);
// The suite's only test case.
bool test_UConverter();
};
///////////////////////////////////////////////////////////////////////////////
#endif // incl_HPHP_TEST_EXT_ICU_UCNV_H_
-382
Ver Arquivo
@@ -1,382 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "hphp/test/ext/test_ext_icu_ucsdet.h"
#include "hphp/runtime/ext/ext_icu_ucsdet.h"
#include "hphp/runtime/ext/ext_array.h"
///////////////////////////////////////////////////////////////////////////////
bool TestExtIcu_ucsdet::RunTests(const std::string &which) {
bool ret = true;
RUN_TEST(test_basics);
// Special cases
RUN_TEST(test_empty);
RUN_TEST(test_cannot_detect);
RUN_TEST(test_declared_encoding);
// English and Western European
RUN_TEST(test_hello_world);
RUN_TEST(test_windows_1252);
// Eastern European
RUN_TEST(test_windows_1250);
// Arabic
RUN_TEST(test_windows_1256);
// Japanese
RUN_TEST(test_shift_jis);
RUN_TEST(test_euc_jp);
RUN_TEST(test_iso_2022_jp);
// Chinese
RUN_TEST(test_gb2312);
RUN_TEST(test_big5);
// Cyrillic
RUN_TEST(test_koi8r);
RUN_TEST(test_windows_1251);
// Universal
RUN_TEST(test_utf8);
RUN_TEST(test_utf16);
return ret;
}
// Hands `bytes` to a fresh EncodingDetector and reports whether the match it
// returns is valid and its UTF-8 text equals `utf8`. An invalid match yields
// false without querying the UTF-8 text.
bool TestExtIcu_ucsdet::detect_and_convert_to_utf8(
    CStrRef bytes,
    CStrRef utf8) {
  p_EncodingDetector charsetDetector(NEWOBJ(c_EncodingDetector)());
  charsetDetector->t_settext(bytes);

  Object detected = charsetDetector->t_detect();
  p_EncodingMatch best = detected.getTyped<c_EncodingMatch>();

  if (best->t_isvalid()) {
    return best->t_getutf8() == utf8;
  }
  return false;
}
// Calls detect() on a detector that never received any text and records a
// pass only if that call throws an Exception.
bool TestExtIcu_ucsdet::test_uninitialized() {
  p_EncodingDetector freshDetector(NEWOBJ(c_EncodingDetector)());
  bool threw = false;
  try {
    Object ignored = freshDetector->t_detect();
  } catch (Exception&) {
    threw = true;
  }
  return Count(threw);
}
// Smoke test of the detector API on U+FEFF followed by U+2603: expects a
// valid match named "UTF-8" with confidence 100 whose UTF-8 text equals the
// input.
bool TestExtIcu_ucsdet::test_basics() {
// This is as unmistakably UTF-8 as it gets.
const char* utf8_snowman_with_bom = "\uFEFF\u2603";
p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)());
detector->t_settext(utf8_snowman_with_bom);
Object matchObj = detector->t_detect();
p_EncodingMatch match = matchObj.getTyped<c_EncodingMatch>();
VERIFY(match->t_isvalid() == true);
VERIFY(match->t_getencoding() == "UTF-8");
VERIFY(match->t_getconfidence() == 100);
VERIFY(match->t_getutf8() == utf8_snowman_with_bom);
return Count(true);
}
// Empty input must still produce a valid match whose UTF-8 text is empty.
bool TestExtIcu_ucsdet::test_empty() {
VERIFY(detect_and_convert_to_utf8("", "") == true);
return Count(true);
}
// Expects detect() to return a match object that reports itself as invalid
// for a short byte sequence the detector cannot classify.
bool TestExtIcu_ucsdet::test_cannot_detect() {
p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)());
// The detector has no idea what to do with this.
detector->t_settext("\xc7\xe8\xec\xed\xe8\xe9 \xe2\xe5\xf7\xe5\xf0");
Object matchObj = detector->t_detect();
p_EncodingMatch match = matchObj.getTyped<c_EncodingMatch>();
VERIFY(match->t_isvalid() == false);
return Count(true);
}
// Sets a declared encoding before detection and checks that the plain ASCII
// text still yields a valid match with unchanged UTF-8 text.
bool TestExtIcu_ucsdet::test_declared_encoding() {
// Right now (ICU 4.6), this API doesn't actually do anything, but
// let's at least verify it doesn't crash.
p_EncodingDetector detector(NEWOBJ(c_EncodingDetector)());
detector->t_settext("Yo!");
detector->t_setdeclaredencoding("windows-1251");
Object matchObj = detector->t_detect();
p_EncodingMatch match = matchObj.getTyped<c_EncodingMatch>();
VERIFY(match->t_isvalid() == true);
VERIFY(match->t_getutf8() == "Yo!");
return Count(true);
}
// Pure ASCII input must come back unchanged as UTF-8.
bool TestExtIcu_ucsdet::test_hello_world() {
VERIFY(detect_and_convert_to_utf8("Hello, world!", "Hello, world!") == true);
return Count(true);
}
// Spanish, French and German samples: each call passes the raw single-byte
// text first and the expected UTF-8 text (written with \u escapes) second.
// Adjacent literals are split wherever the character after a \x escape would
// otherwise be read as another hex digit.
bool TestExtIcu_ucsdet::test_windows_1252() {
VERIFY(detect_and_convert_to_utf8(
"Toda Europa ley\xf3 Don Quijote como una s\xe1tira.",
"Toda Europa ley\u00f3 Don Quijote como una s\u00e1tira.") == true);
VERIFY(detect_and_convert_to_utf8(
"Notre P\xe8re, qui \xeates aux cieux",
"Notre P\u00e8re, qui \u00eates aux cieux") == true);
VERIFY(detect_and_convert_to_utf8(
"Marta da Silva, als beste Spielerin und beste Torsch\xFCtzin der WM "
"2007 sowie bisher f\xFCnf Mal als \x84Weltfu\xDF" "ballerin des "
"Jahres\x93 ausgezeichnet, kommt zur Welt.",
"Marta da Silva, als beste Spielerin und beste Torsch\u00fctzin der WM "
"2007 sowie bisher f\u00FCnf Mal als \u201EWeltfu\u00DFballerin des "
"Jahres\u201C ausgezeichnet, kommt zur Welt.") == true);
return Count(true);
}
// Eastern European samples: raw single-byte text first, expected UTF-8 text
// second. Literals are split after a \x escape when the next character is a
// hex digit.
bool TestExtIcu_ucsdet::test_windows_1250() {
VERIFY(detect_and_convert_to_utf8(
"Do Wikipedie m\xf9\x9e" "e p\xf8isp\xedvat kdokoliv.",
"Do Wikipedie m\u016f\u017ee p\u0159isp\u00edvat kdokoliv.") == true);
VERIFY(detect_and_convert_to_utf8(
"O\xe8" "e na\x9a, koji jesi na nebesima, sveti se ime Tvoje.",
"O\u010de na\u0161, koji jesi na nebesima, sveti se ime Tvoje.") == true);
VERIFY(detect_and_convert_to_utf8(
"Prezentacj\xea pierwszego graficznego \x9crodowiska pracy z rodziny "
"Windows firmy Microsoft przeprowadzono w listopadzie 1985.",
"Prezentacj\u0119 pierwszego graficznego \u015brodowiska pracy z rodziny "
"Windows firmy Microsoft przeprowadzono w listopadzie 1985.") == true);
return Count(true);
}
// Arabic samples (one short, one long): raw single-byte text first, expected
// UTF-8 text second.
bool TestExtIcu_ucsdet::test_windows_1256() {
VERIFY(detect_and_convert_to_utf8(
"\xe1\xc7 \xc3\xca\xdf\xe1\xe3 \xc7\xe1\xda\xd1\xc8\xed\xc9",
"\u0644\u0627 \u0623\u062a\u0643\u0644\u0645 \u0627\u0644\u0639\u0631\u0628"
"\u064a\u0629") == true);
VERIFY(detect_and_convert_to_utf8(
"\xe6\xed\xdf\xed\xc8\xed\xcf\xed\xc7 \xe5\xed \xe3\xd4\xd1\xe6\xda \xe3"
"\xe6\xd3\xe6\xda\xc9 \xe3\xca\xda\xcf\xcf\xc9 \xc7\xe1\xe1\xdb\xc7\xca"
"\xa1 \xe3\xc8\xe4\xed\xc9 \xda\xe1\xec \xc7\xe1\xe6\xed\xc8\xa1 \xd0\xc7"
"\xca \xe3\xcd\xca\xe6\xec \xcd\xd1\xa1 \xca\xd4\xdb\xe1\xe5\xc7 \xe3\xc4"
"\xd3\xd3\xc9 \xe6\xed\xdf\xed\xe3\xed\xcf\xed\xc7\xa1 \xc7\xe1\xca"
"\xed \xe5\xed \xe3\xe4\xd9\xe3\xc9 \xdb\xed\xd1 \xd1\xc8\xcd\xed\xc9.",
"\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u0647\u064a \u0645"
"\u0634\u0631\u0648\u0639 \u0645\u0648\u0633\u0648\u0639\u0629 \u0645\u062a"
"\u0639\u062f\u062f\u0629 \u0627\u0644\u0644\u063a\u0627\u062a\u060c \u0645"
"\u0628\u0646\u064a\u0629 \u0639\u0644\u0649 \u0627\u0644\u0648\u064a\u0628"
"\u060c \u0630\u0627\u062a \u0645\u062d\u062a\u0648\u0649 \u062d\u0631"
"\u060c \u062a\u0634\u063a\u0644\u0647\u0627 \u0645\u0624\u0633\u0633"
"\u0629 \u0648\u064a\u0643\u064a\u0645\u064a\u062f\u064a\u0627\u060c \u0627"
"\u0644\u062a\u064a \u0647\u064a \u0645\u0646\u0638\u0645\u0629 \u063a"
"\u064a\u0631 \u0631\u0628\u062d\u064a\u0629.") == true);
return Count(true);
}
// Two Japanese multi-byte samples: raw bytes first, expected UTF-8 text
// second. Literals are split after a \x escape when the next character is a
// hex digit. A third, shorter sample is kept commented out because the
// detector misclassifies it.
bool TestExtIcu_ucsdet::test_shift_jis() {
VERIFY(detect_and_convert_to_utf8(
"\x81w\x82\xc6\x82\xc8\x82\xe8\x82\xcc\x83g\x83g\x83\x8d\x81x\x82\xcd\x81"
"A\x83X\x83^\x83W\x83I\x83W\x83u\x83\x8a\x90\xa7\x8d\xec\x82\xcc\x93\xfa"
"\x96{\x82\xcc\x92\xb7\x95\xd2\x83" "A\x83j\x83\x81\x81[\x83V\x83\x87\x83"
"\x93\x8d\xec\x95i\x81" "B",
"\u300e\u3068\u306a\u308a\u306e\u30c8\u30c8\u30ed\u300f\u306f\u3001\u30b9"
"\u30bf\u30b8\u30aa\u30b8\u30d6\u30ea\u5236\u4f5c\u306e\u65e5\u672c\u306e"
"\u9577\u7de8\u30a2\u30cb\u30e1\u30fc\u30b7\u30e7\u30f3\u4f5c\u54c1\u3002")
== true);
VERIFY(detect_and_convert_to_utf8(
"\x83" "E\x83" "B\x83L\x83y\x83" "f\x83" "B\x83" "A (Wikipedia) \x82\xcd"
"\x83" "E\x83" "B\x83L\x83\x81\x83" "f\x83" "B\x83" "A\x8d\xe0\x92"
"c\x82\xaa\x89^\x89" "c\x82\xb7\x82\xe9\x83I\x83\x93\x83\x89\x83" "C\x83"
"\x93\x95S\x89\xc8\x8e\x96\x93T\x81" "B",
"\u30a6\u30a3\u30ad\u30da\u30c7\u30a3\u30a2 (Wikipedia) \u306f\u30a6"
"\u30a3\u30ad\u30e1\u30c7\u30a3\u30a2\u8ca1\u56e3\u304c\u904b\u55b6\u3059"
"\u308b\u30aa\u30f3\u30e9\u30a4\u30f3\u767e\u79d1\u4e8b\u5178\u3002")
== true);
// Too short; detector thinks it's most likely Windows-1252.
//
// VERIFY(detect_and_convert_to_utf8(
// "\x90\xa2\x8a" "E\x90l\x8c\xa0\x90\xe9\x8c\xbe",
// "\u4e16\u754c\u4eba\u6a29\u5ba3\u8a00") == true);
return Count(true);
}
// Two Japanese multi-byte samples (the second mixes in ASCII text): raw
// bytes first, expected UTF-8 text second.
bool TestExtIcu_ucsdet::test_euc_jp() {
VERIFY(detect_and_convert_to_utf8(
"\xb2\xbf\xbf\xcd\xa4\xe2\xa1\xa2\xa4\xdb\xa4\xb7\xa4\xa4\xa4\xde\xa4\xde"
"\xa4\xcb\xc2\xe1\xca\xe1\xa1\xa2\xb9\xb4\xb6\xd8\xa1\xa2\xcb\xf4\xa4\xcf"
"\xc4\xc9\xca\xfc\xa4\xb5\xa4\xec\xa4\xeb\xa4\xb3\xa4\xc8\xa4\xcf\xa4\xca"
"\xa4\xa4\xa1\xa3",
"\u4f55\u4eba\u3082\u3001\u307b\u3057\u3044\u307e\u307e\u306b\u902e\u6355"
"\u3001\u62d8\u7981\u3001\u53c8\u306f\u8ffd\u653e\u3055\u308c\u308b\u3053"
"\u3068\u306f\u306a\u3044\u3002") == true);
VERIFY(detect_and_convert_to_utf8(
"1920\xc7\xaf \xa5\xa6\xa5\xa3\xa5\xf3\xa5\xd6\xa5\xeb\xa5\xc9\xa5\xf3\xc1"
"\xaa\xbc\xea\xb8\xa2\xa1\xcaThe Championships, Wimbledon 1920\xa1\xcb\xa4"
"\xcb\xb4\xd8\xa4\xb9\xa4\xeb\xb5\xad\xbb\xf6\xa1\xa3",
"1920\u5e74 \u30a6\u30a3\u30f3\u30d6\u30eb\u30c9\u30f3\u9078\u624b\u6a29"
"\uff08The Championships, Wimbledon 1920\uff09\u306b\u95a2\u3059\u308b"
"\u8a18\u4e8b\u3002") == true);
return Count(true);
}
// Two samples that switch character sets with ESC (\x1b) sequences: raw
// bytes first, expected UTF-8 text second.
bool TestExtIcu_ucsdet::test_iso_2022_jp() {
VERIFY(detect_and_convert_to_utf8(
"\x1b$B%-%M%F%#%C%/%3%M%/%7%g%s!J\x1b(Bkinetic connection\x1b$B!K$O\x1b(B1"
"986\x1b$BG/$K%=%K!<$,\x1b(BMSX2\x1b$B$GH/Gd$7$?%Q%:%k%2!<%`!#\x1b(B",
"\u30ad\u30cd\u30c6\u30a3\u30c3\u30af\u30b3\u30cd\u30af\u30b7\u30e7\u30f3"
"\uff08kinetic connection\uff09\u306f1986\u5e74\u306b\u30bd\u30cb\u30fc"
"\u304cMSX2\u3067\u767a\u58f2\u3057\u305f\u30d1\u30ba\u30eb\u30b2\u30fc"
"\u30e0\u3002") == true);
VERIFY(detect_and_convert_to_utf8(
"\x1b$B5f6K\x1b(B!!\x1b$BJQBV2>LL\x1b(B",
"\u7a76\u6975!!\u5909\u614b\u4eee\u9762") == true);
return Count(true);
}
// Two Chinese multi-byte samples: raw bytes first, expected UTF-8 text
// second. A third, shorter sample is kept commented out because the detector
// misclassifies it.
bool TestExtIcu_ucsdet::test_gb2312() {
VERIFY(detect_and_convert_to_utf8(
"\xca\xc7\xd2\xbb\xb8\xf6\xd3\xef\xd1\xd4\xa1\xa2\xc4\xda\xc8\xdd\xbf\xaa"
"\xb7\xc5\xb5\xc4\xcd\xf8\xc2\xe7\xb0\xd9\xbf\xc6\xc8\xab\xca\xe9\xbc\xc6"
"\xbb\xae",
"\u662f\u4e00\u4e2a\u8bed\u8a00\u3001\u5185\u5bb9\u5f00\u653e\u7684\u7f51"
"\u7edc\u767e\u79d1\u5168\u4e66\u8ba1\u5212") == true);
VERIFY(detect_and_convert_to_utf8(
"\xa1\xb6\xd2\xbb\xc1\xa3\xd6\xd3\xd5\xe6\xc8\xcb\xcb\xd5\xa1\xb7\xa3\xa8"
"\xd3\xa2\xce\xc4\xc3\xfb\xa3\xbaSo Real Time Cooking\xa3\xa9",
"\u300a\u4e00\u7c92\u949f\u771f\u4eba\u82cf\u300b\uff08\u82f1\u6587\u540d"
"\uff1aSo Real Time Cooking\uff09") == true);
// Too short; detector thinks it's most likely Shift-JIS.
// VERIFY(detect_and_convert_to_utf8(
// "\xce\xe2\xb8\xe7\xbf\xdf",
// "\u5434\u54e5\u7a9f") == true);
return Count(true);
}
// Two Chinese multi-byte samples: raw bytes first, expected UTF-8 text
// second. Literals are split after a \x escape when the next character is a
// hex digit. A third, shorter sample is kept commented out because the
// detector misclassifies it.
bool TestExtIcu_ucsdet::test_big5() {
VERIFY(detect_and_convert_to_utf8(
"1\xa1]\xa4@\xa1^\xacO0\xbbP2\xa4\xa7\xb6\xa1\xaa\xba\xa6\xdb\xb5M\xbc\xc6"
"\xa1" "A\xacO\xb3\xcc\xa4p\xaa\xba\xa5\xbf\xa9_\xbc\xc6\xa1" "C",
"1\uff08\u4e00\uff09\u662f0\u82072\u4e4b\u9593\u7684\u81ea\u7136\u6578"
"\uff0c\u662f\u6700\u5c0f\u7684\u6b63\u5947\u6578\u3002") == true);
VERIFY(detect_and_convert_to_utf8(
"\xbeG\xa5\xf2\xaf\xf4\xa1]\xad^\xa4\xe5\xa6W \xa1G Cheng Chung Yin\xa1^"
"\xa1" "A\xacO\xa4@\xa6W\xa5x\xc6W\xa4k\xbat\xad\xfb",
"\u912d\u4ef2\u8335\uff08\u82f1\u6587\u540d \uff1a Cheng Chung Yin\uff09"
"\uff0c\u662f\u4e00\u540d\u53f0\u7063\u5973\u6f14\u54e1") == true);
// Too short; detector thinks it's most likely Shift-JIS.
// VERIFY(detect_and_convert_to_utf8(
// "\xa7" "d\xad\xf4\xb8]",
// "\u5433\u54e5\u7a9f") == true);
return Count(true);
}
// Three Cyrillic single-byte samples of decreasing length: raw bytes first,
// expected UTF-8 text second.
bool TestExtIcu_ucsdet::test_koi8r() {
VERIFY(detect_and_convert_to_utf8(
"\xeb\xd7\xc5\xc2\xc5\xcb \xd0\xc5\xd2\xd7\xc1\xd1 \xd0\xcf \xd0\xcc\xcf"
"\xdd\xc1\xc4\xc9 \xc9 \xd7\xd4\xcf\xd2\xc1\xd1 \xd0\xcf \xce\xc1\xd3\xc5"
"\xcc\xc5\xce\xc9\xc0 \xd0\xd2\xcf\xd7\xc9\xce\xc3\xc9\xd1 \xeb\xc1\xce"
"\xc1\xc4\xd9.",
"\u041a\u0432\u0435\u0431\u0435\u043a \u043f\u0435\u0440\u0432\u0430"
"\u044f \u043f\u043e \u043f\u043b\u043e\u0449\u0430\u0434\u0438 \u0438 "
"\u0432\u0442\u043e\u0440\u0430\u044f \u043f\u043e \u043d\u0430\u0441"
"\u0435\u043b\u0435\u043d\u0438\u044e \u043f\u0440\u043e\u0432\u0438"
"\u043d\u0446\u0438\u044f \u041a\u0430\u043d\u0430\u0434\u044b.") == true);
VERIFY(detect_and_convert_to_utf8(
"\xe2\xdf\xd2\xc4\xc5\xce\xc9 \xc5 \xd3\xc5\xcc\xcf \xd7 \xf3\xc5\xd7\xc5"
"\xd2\xce\xc1 \xe2\xdf\xcc\xc7\xc1\xd2\xc9\xd1",
"\u0411\u044a\u0440\u0434\u0435\u043d\u0438 \u0435 \u0441\u0435\u043b"
"\u043e \u0432 \u0421\u0435\u0432\u0435\u0440\u043d\u0430 \u0411\u044a"
"\u043b\u0433\u0430\u0440\u0438\u044f") == true);
VERIFY(detect_and_convert_to_utf8(
"\xfe\xc5\xd2\xd7\xc5\xce\xc1 \xd0\xd2\xc5\xd7\xdf\xda\xc8\xcf\xc4\xce"
"\xc1",
"\u0427\u0435\u0440\u0432\u0435\u043d\u0430 \u043f\u0440\u0435\u0432"
"\u044a\u0437\u0445\u043e\u0434\u043d\u0430") == true);
return Count(true);
}
// Two Cyrillic single-byte samples (one short, one long): raw bytes first,
// expected UTF-8 text second.
bool TestExtIcu_ucsdet::test_windows_1251() {
VERIFY(detect_and_convert_to_utf8(
"\xce\xf7\xe5 \xed\xe0\xf8, \xea\xee\xbc \xf1\xe8 \xed\xe0 \xed\xe5\xe1"
"\xe5\xf1\xe0\xf2\xe0",
"\u041e\u0447\u0435 \u043d\u0430\u0448, \u043a\u043e\u0458 \u0441\u0438 "
"\u043d\u0430 \u043d\u0435\u0431\u0435\u0441\u0430\u0442\u0430") == true);
VERIFY(detect_and_convert_to_utf8(
"\xd1 \xe7\xee\xee\xeb\xee\xe3\xe8\xf7\xe5\xf1\xea\xee\xe9 \xf2\xee\xf7\xea"
"\xe8 \xe7\xf0\xe5\xed\xe8\xff, \xe4\xee\xec\xe0\xf8\xed\xff\xff \xea\xee"
"\xf8\xea\xe0 \x97 \xec\xeb\xe5\xea\xee\xef\xe8\xf2\xe0\xfe\xf9\xe5\xe5"
" \xf1\xe5\xec\xe5\xe9\xf1\xf2\xe2\xe0 \xea\xee\xf8\xe0\xf7\xfc\xe8\xf5 "
"\xee\xf2\xf0\xff\xe4\xe0 \xf5\xe8\xf9\xed\xfb\xf5.",
"\u0421 \u0437\u043e\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a"
"\u043e\u0439 \u0442\u043e\u0447\u043a\u0438 \u0437\u0440\u0435\u043d\u0438"
"\u044f, \u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043a\u043e"
"\u0448\u043a\u0430 \u2014 \u043c\u043b\u0435\u043a\u043e\u043f\u0438\u0442"
"\u0430\u044e\u0449\u0435\u0435 \u0441\u0435\u043c\u0435\u0439\u0441\u0442"
"\u0432\u0430 \u043a\u043e\u0448\u0430\u0447\u044c\u0438\u0445 \u043e\u0442"
"\u0440\u044f\u0434\u0430 \u0445\u0438\u0449\u043d\u044b\u0445.") == true);
return Count(true);
}
// Input and expected output are the same \u-escaped literal in both calls,
// i.e. the text must come back unchanged.
bool TestExtIcu_ucsdet::test_utf8() {
VERIFY(detect_and_convert_to_utf8(
"\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0",
"\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0")
== true);
VERIFY(detect_and_convert_to_utf8(
"\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01",
"\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01") == true);
return Count(true);
}
// Feeds "Hello, world!" as 16-bit units prefixed with the bytes FF FE and
// expects the UTF-8 result to keep the BOM (U+FEFF) in front of the text.
bool TestExtIcu_ucsdet::test_utf16() {
// The detector only handles UTF-16 if there's a BOM at the front.
char utf16[] =
"\xff\xfeH\x00" "e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00"
"d\x00!\x00";
// Take off 1 byte for the NUL at the end of the char[].
// The explicit length is required because the data contains embedded NUL
// bytes.
String utf16Str(utf16, sizeof utf16 - 1, AttachLiteral);
VERIFY(detect_and_convert_to_utf8(
utf16Str,
"\ufeffHello, world!") == true);
return Count(true);
}
-70
Ver Arquivo
@@ -1,70 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#ifndef incl_HPHP_TEST_EXT_ICU_UCSDET_H_
#define incl_HPHP_TEST_EXT_ICU_UCSDET_H_
// >>>>>> Generated by idl.php. Do NOT modify. <<<<<<
#include "hphp/test/ext/test_cpp_ext.h"
///////////////////////////////////////////////////////////////////////////////
// Test fixture for the Icu_ucsdet (charset detection) extension, built on
// TestCppExt.
class TestExtIcu_ucsdet : public TestCppExt {
public:
// Suite entry point. NOTE(review): `which` presumably selects the test(s)
// to run -- confirm against TestCppExt / RUN_TEST.
virtual bool RunTests(const std::string &which);
private:
// Helper: detects the encoding of `bytes` and compares the UTF-8 conversion
// of the match with `utf8`.
bool detect_and_convert_to_utf8(CStrRef bytes, CStrRef utf8);
bool test_basics();
bool test_uninitialized();
bool test_empty();
bool test_cannot_detect();
bool test_declared_encoding();
// English and Western European
bool test_hello_world();
bool test_windows_1252();
// Eastern European
bool test_windows_1250();
// Arabic
bool test_windows_1256();
// Japanese
bool test_shift_jis();
bool test_euc_jp();
bool test_iso_2022_jp();
// Chinese
bool test_gb2312();
bool test_big5();
// Cyrillic
bool test_koi8r();
bool test_windows_1251();
// Universal
bool test_utf8();
bool test_utf16();
};
///////////////////////////////////////////////////////////////////////////////
#endif // incl_HPHP_TEST_EXT_ICU_UCSDET_H_
-213
Ver Arquivo
@@ -1,213 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "hphp/test/ext/test_ext_icu_uspoof.h"
#include "hphp/runtime/ext/ext_icu_uspoof.h"
///////////////////////////////////////////////////////////////////////////////
// Entry point of the Icu_uspoof (SpoofChecker) suite: runs the five
// SpoofChecker tests through RUN_TEST and returns the result flag.
// NOTE(review): RUN_TEST is defined outside this file; presumably it uses
// `which` to filter tests and clears `ret` on failure -- confirm.
bool TestExtIcu_uspoof::RunTests(const std::string &which) {
bool ret = true;
RUN_TEST(test_SpoofChecker_issuspicious);
RUN_TEST(test_SpoofChecker_areconfusable);
RUN_TEST(test_SpoofChecker_issuesfound);
RUN_TEST(test_SpoofChecker_setchecks);
RUN_TEST(test_SpoofChecker_setallowedlocales);
return ret;
}
///////////////////////////////////////////////////////////////////////////////
// Checks issuspicious() on a default-constructed checker against a set of
// plain, mixed-script and look-alike strings, then feeds it bytes that are
// not UTF-8. The test only counts as passed when that last call throws an
// Exception; falling through records a failure.
bool TestExtIcu_uspoof::test_SpoofChecker_issuspicious() {
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
VS(checker->t_issuspicious("facebook"), false);
// facebook with Cyrillic spoof characters
VS(checker->t_issuspicious("f\u0430\u0441\u0435b\u043e\u043ek"), true);
// "Russia" in Cyrillic with Latin spoof characters
VS(checker->t_issuspicious("Pocc\u0438\u044f"), true);
// paypal with Cyrillic spoof characters
VS(checker->t_issuspicious("http://www.payp\u0430l.com"), true);
// certain all-uppercase Latin sequences can be spoof of Greek
VS(checker->t_issuspicious("NAPKIN PEZ"), true);
VS(checker->t_issuspicious("napkin pez"), false);
// English with Japanese characters
VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), false);
// Japanese name with mixed kanji and hiragana
VS(checker->t_issuspicious("\u6a4b\u672c\u611b\u307f"), false);
try {
checker->t_issuspicious("this is not UTF-8: \x87\xFB\xCA\x94\xDB");
} catch (Exception& e) {
return Count(true);
}
return Count(false);
}
// Checks areconfusable() on a default-constructed checker for pairs that
// should and should not be reported as confusable, then passes bytes that
// are not UTF-8. The test only counts as passed when that last call throws
// an Exception; falling through records a failure.
bool TestExtIcu_uspoof::test_SpoofChecker_areconfusable() {
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
VS(checker->t_areconfusable("hello, world", "goodbye, world"), false);
VS(checker->t_areconfusable("hello, world", "hello, world"), true);
VS(checker->t_areconfusable("hello, world", "he11o, wor1d"), true);
VS(checker->t_areconfusable("hell\u00f8", "hello\u0337"), true);
VS(checker->t_areconfusable("facebook", "f\u0430\u0441\u0435b\u043e\u043ek"),
true);
VS(checker->t_areconfusable("facebook", "\U0001d41faceboo\u1d0b"), true);
VS(checker->t_areconfusable("facebook", "\u017facebook"), true);
VS(checker->t_areconfusable("paypal", "payp\u0430l"), true);
VS(checker->t_areconfusable(
"NAPKIN PEZ",
"\u039d\u0391\u03a1\u039a\u0399\u039d \u03a1\u0395\u0396"),
true);
VS(checker->t_areconfusable(
"facebook",
"ufiek-a\u048ba\u049d \u049da\u048b\u00f0a\u048b\u01e5a\u048b-\u049dota-"
"\u00f0o\u00f0ol"),
false);
try {
checker->t_areconfusable(
"this is not UTF-8: \x87\xFB\xCA\x94\xDB",
"so there.");
} catch (Exception& e) {
return Count(true);
}
return Count(false);
}
// Checks the value written through the by-reference argument of
// issuspicious()/areconfusable(): it is compared against the
// WHOLE_SCRIPT_, MIXED_SCRIPT_ and SINGLE_SCRIPT_CONFUSABLE constants.
bool TestExtIcu_uspoof::test_SpoofChecker_issuesfound() {
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
Variant ret;
VS(checker->t_issuspicious("NAPKIN PEZ", ref(ret)), true);
VS(ret.getInt64(), q_SpoofChecker$$WHOLE_SCRIPT_CONFUSABLE);
VS(checker->t_issuspicious("f\u0430\u0441\u0435b\u043e\u043ek", ref(ret)),
true);
VS(ret.getInt64(), q_SpoofChecker$$MIXED_SCRIPT_CONFUSABLE);
VS(checker->t_areconfusable("hello, world", "he11o, wor1d", ref(ret)), true);
VS(ret.getInt64(), q_SpoofChecker$$SINGLE_SCRIPT_CONFUSABLE);
return Count(true);
}
// Exercises setchecks() in three steps, each on its own checker:
//  1. restricting the checks to the three *_CONFUSABLE flags changes the
//     result for the upper-case pair but not for the lower-case pair;
//  2. enabling SINGLE_SCRIPT makes a mixed-script sentence suspicious;
//  3. an invalid flag value must throw an Exception -- the test only counts
//     as passed through that catch branch.
bool TestExtIcu_uspoof::test_SpoofChecker_setchecks() {
{
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
// The checker should start in any-case mode.
VS(checker->t_areconfusable("HELLO", "H\u0415LLO"), true);
VS(checker->t_areconfusable("hello", "h\u0435llo"), true);
// Go to lower-case only mode (assumes all strings have been
// case-folded).
checker->t_setchecks(
q_SpoofChecker$$MIXED_SCRIPT_CONFUSABLE |
q_SpoofChecker$$WHOLE_SCRIPT_CONFUSABLE |
q_SpoofChecker$$SINGLE_SCRIPT_CONFUSABLE
);
VS(checker->t_areconfusable("HELLO", "H\u0415LLO"), false);
VS(checker->t_areconfusable("hello", "h\u0435llo"), true);
}
{
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), false);
// Only allow characters of a single script.
checker->t_setchecks(q_SpoofChecker$$SINGLE_SCRIPT);
VS(checker->t_issuspicious("True fact: \u5fcd\u8005 are mammals"), true);
}
try {
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
checker->t_setchecks(0xDEADBEEF);
} catch (Exception& e) {
return Count(true);
}
return Count(false);
}
// Exercises setallowedlocales() on one checker with five successive locale
// lists. For each list the same six sample strings are checked: the Latin
// name and the snowman are never suspicious, and each non-Latin sample is
// suspicious unless its locale is in the allowed list.
bool TestExtIcu_uspoof::test_SpoofChecker_setallowedlocales() {
p_SpoofChecker checker(NEWOBJ(c_SpoofChecker)());
const char* common = "Rogers";
const char* japanese_kanji_hiragana = "\u6a4b\u672c\u611b\u307f";
const char* korean = "\ud55c\uad6d\ub9d0";
const char* arabic = "\u0645\u0631\u062d\u0628\u064b\u0627";
const char* russian_cyrillic =
"\u0417\u0438\u0301\u043c\u043d\u0438\u0439 "
"\u0432\u0435\u0301\u0447\u0435\u0440";
const char* snowman = "\u2603";
// English only.
checker->t_setallowedlocales("en_US");
VS(checker->t_issuspicious(common), false);
VS(checker->t_issuspicious(japanese_kanji_hiragana), true);
VS(checker->t_issuspicious(russian_cyrillic), true);
VS(checker->t_issuspicious(arabic), true);
VS(checker->t_issuspicious(korean), true);
VS(checker->t_issuspicious(snowman), false);
// English + Japanese.
checker->t_setallowedlocales("en_US, ja_JP");
VS(checker->t_issuspicious(common), false);
VS(checker->t_issuspicious(japanese_kanji_hiragana), false);
VS(checker->t_issuspicious(russian_cyrillic), true);
VS(checker->t_issuspicious(arabic), true);
VS(checker->t_issuspicious(korean), true);
VS(checker->t_issuspicious(snowman), false);
// English + Korean.
checker->t_setallowedlocales("en_US, ko_KR");
VS(checker->t_issuspicious(common), false);
VS(checker->t_issuspicious(japanese_kanji_hiragana), true);
VS(checker->t_issuspicious(russian_cyrillic), true);
VS(checker->t_issuspicious(arabic), true);
VS(checker->t_issuspicious(korean), false);
VS(checker->t_issuspicious(snowman), false);
// English + Arabic.
checker->t_setallowedlocales("en_US, ar_AR");
VS(checker->t_issuspicious(common), false);
VS(checker->t_issuspicious(japanese_kanji_hiragana), true);
VS(checker->t_issuspicious(russian_cyrillic), true);
VS(checker->t_issuspicious(arabic), false);
VS(checker->t_issuspicious(korean), true);
VS(checker->t_issuspicious(snowman), false);
// English + Russian.
checker->t_setallowedlocales("en_US, ru_RU");
VS(checker->t_issuspicious(common), false);
VS(checker->t_issuspicious(japanese_kanji_hiragana), true);
VS(checker->t_issuspicious(russian_cyrillic), false);
VS(checker->t_issuspicious(arabic), true);
VS(checker->t_issuspicious(korean), true);
VS(checker->t_issuspicious(snowman), false);
return Count(true);
}
-39
Ver Arquivo
@@ -1,39 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
// Include guard uses the same incl_HPHP_..._H_ scheme as the sibling
// test_ext_icu_ucnv.h / test_ext_icu_ucsdet.h headers; the project prefix
// keeps the macro from colliding with unrelated headers.
#ifndef incl_HPHP_TEST_EXT_ICU_USPOOF_H_
#define incl_HPHP_TEST_EXT_ICU_USPOOF_H_
// >>>>>> Generated by idl.php. Do NOT modify. <<<<<<
#include "hphp/test/ext/test_cpp_ext.h"
///////////////////////////////////////////////////////////////////////////////
// Test fixture for the Icu_uspoof (SpoofChecker) extension, built on
// TestCppExt.
class TestExtIcu_uspoof : public TestCppExt {
 public:
  // Suite entry point. NOTE(review): `which` presumably selects the test(s)
  // to run -- confirm against TestCppExt / RUN_TEST.
  virtual bool RunTests(const std::string &which);

  // One test per SpoofChecker feature.
  bool test_SpoofChecker_issuspicious();
  bool test_SpoofChecker_areconfusable();
  bool test_SpoofChecker_issuesfound();
  bool test_SpoofChecker_setchecks();
  bool test_SpoofChecker_setallowedlocales();
};
///////////////////////////////////////////////////////////////////////////////
#endif // incl_HPHP_TEST_EXT_ICU_USPOOF_H_
+322
Ver Arquivo
@@ -0,0 +1,322 @@
<?php
// Strict-equality check used by every test in this file.
// Always dumps the boolean result of `$x === $y` (this is what the expected
// output is compared against). On a mismatch it additionally prints the
// expected value ($y), the actual value ($x) and a backtrace.
function VS($x, $y) {
  var_dump($x === $y);
  if ($x !== $y) {
    // var_export keeps booleans, null and arrays readable; plain string
    // interpolation would print false/null as an empty string and true as 1.
    echo "Failed: " . var_export($y, true) . "\n";
    echo "Got: " . var_export($x, true) . "\n";
    var_dump(debug_backtrace());
  }
}
// Checks that $x is exactly boolean true by delegating to VS().
function VERIFY($x) {
  $expected = true;
  VS($x, $expected);
}
//////////////////////////////////////////////////////////////////////
// PHP string literals have no \uXXXX escape, so the escapes in $x are decoded
// by wrapping the text in double quotes and parsing it as a JSON string.
function getunicode($x) {
  $json_literal = "\"" . $x . "\"";
  return json_decode($json_literal);
}
// Runs $bytes through a fresh EncodingDetector and reports whether the match
// is valid and its UTF-8 text equals $utf8 (loose comparison). An invalid
// match yields false without querying the UTF-8 text.
function detect_and_convert_to_utf8($bytes, $utf8) {
  $enc_detector = new EncodingDetector();
  $enc_detector->settext($bytes);
  $best = $enc_detector->detect();
  if ($best->isvalid()) {
    // Debugging aid:
    // echo "Got: " . $best->getutf8() . "\n";
    // echo "Want: " . $utf8 . "\n";
    return $best->getutf8() == $utf8;
  }
  return false;
}
// Smoke test of the detector API on U+FEFF followed by U+2603: expects a
// valid match named "UTF-8" with confidence 100 whose UTF-8 text equals the
// input.
function test_basics() {
  // BOM + snowman: as unmistakably UTF-8 as it gets.
  $sample = getunicode("\\uFEFF\\u2603");

  $enc_detector = new EncodingDetector();
  $enc_detector->settext($sample);
  $best = $enc_detector->detect();

  VERIFY($best->isvalid() == true);
  VERIFY($best->getencoding() == "UTF-8");
  VERIFY($best->getconfidence() == 100);
  VERIFY($best->getutf8() == $sample);
}
// Expects detect() to return a match that reports itself as invalid for a
// short byte sequence the detector cannot classify.
function test_cannot_detect() {
  $enc_detector = new EncodingDetector();
  // Bytes the detector has no idea what to do with.
  $undetectable = "\xc7\xe8\xec\xed\xe8\xe9 \xe2\xe5\xf7\xe5\xf0";
  $enc_detector->settext($undetectable);
  $result = $enc_detector->detect();
  VERIFY($result->isvalid() == false);
}
// Sets a declared encoding before detection and checks that the plain ASCII
// text still yields a valid match with unchanged UTF-8 text.
// As of ICU 4.6 the declared encoding has no effect, so this mainly verifies
// that the call does not crash.
function test_declared_encoding() {
  $enc_detector = new EncodingDetector();
  $enc_detector->settext("Yo!");
  $enc_detector->setdeclaredencoding("windows-1251");

  $result = $enc_detector->detect();
  VERIFY($result->isvalid() == true);
  VERIFY($result->getutf8() == "Yo!");
}
// Pure ASCII input must come back unchanged as UTF-8.
function test_hello_world() {
  $ascii = "Hello, world!";
  VERIFY(detect_and_convert_to_utf8($ascii, $ascii) == true);
}
// Spanish, French and German samples: each call passes the raw single-byte
// text (double-quoted, \x escapes) first and the expected UTF-8 text second.
// The expected text is single-quoted so the \u escapes reach getunicode()
// untouched.
function test_windows_1252() {
VERIFY(detect_and_convert_to_utf8(
"Toda Europa ley\xf3 Don Quijote como una s\xe1tira.",
getunicode('Toda Europa ley\u00f3 Don Quijote como una s\u00e1tira.'))
== true);
VERIFY(detect_and_convert_to_utf8(
"Notre P\xe8re, qui \xeates aux cieux",
getunicode('Notre P\u00e8re, qui \u00eates aux cieux')) == true);
VERIFY(detect_and_convert_to_utf8(
"Marta da Silva, als beste Spielerin und beste Torsch\xFCtzin der WM ".
"2007 sowie bisher f\xFCnf Mal als \x84Weltfu\xDF". "ballerin des ".
"Jahres\x93 ausgezeichnet, kommt zur Welt.",
getunicode(
'Marta da Silva, als beste Spielerin und beste Torsch\u00fctzin der WM '.
'2007 sowie bisher f\u00FCnf Mal als \u201EWeltfu\u00DFballerin des '.
'Jahres\u201C ausgezeichnet, kommt zur Welt.')) == true);
}
// Eastern European samples: raw single-byte text (double-quoted, \x escapes)
// first, expected UTF-8 text second. The expected text is single-quoted so
// the \u escapes reach getunicode() untouched.
function test_windows_1250() {
VERIFY(detect_and_convert_to_utf8(
"Do Wikipedie m\xf9\x9e" ."e p\xf8isp\xedvat kdokoliv.",
getunicode('Do Wikipedie m\u016f\u017ee p\u0159isp\u00edvat kdokoliv.'))
== true);
VERIFY(detect_and_convert_to_utf8(
"O\xe8". "e na\x9a, koji jesi na nebesima, sveti se ime Tvoje.",
getunicode('O\u010de na\u0161, koji jesi na nebesima, sveti se ime Tvoje.'))
== true);
VERIFY(detect_and_convert_to_utf8(
"Prezentacj\xea pierwszego graficznego \x9crodowiska pracy z rodziny ".
"Windows firmy Microsoft przeprowadzono w listopadzie 1985.",
getunicode('Prezentacj\u0119 pierwszego graficznego \u015brodowiska pracy z rodziny '.
'Windows firmy Microsoft przeprowadzono w listopadzie 1985.')) == true);
}
/**
 * Detection/conversion cases for single-byte input (Windows-1256, per the
 * test name): one short phrase and one longer paragraph, each compared
 * with the text that getunicode() builds from \u escapes.
 */
function test_windows_1256() {
  $raw = "\xe1\xc7 \xc3\xca\xdf\xe1\xe3 \xc7\xe1\xda\xd1\xc8\xed\xc9";
  $want = getunicode(
    '\u0644\u0627 \u0623\u062a\u0643\u0644\u0645 \u0627\u0644\u0639\u0631\u0628'.
    '\u064a\u0629');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\xe6\xed\xdf\xed\xc8\xed\xcf\xed\xc7 \xe5\xed \xe3\xd4\xd1\xe6\xda \xe3".
    "\xe6\xd3\xe6\xda\xc9 \xe3\xca\xda\xcf\xcf\xc9 \xc7\xe1\xe1\xdb\xc7\xca".
    "\xa1 \xe3\xc8\xe4\xed\xc9 \xda\xe1\xec \xc7\xe1\xe6\xed\xc8\xa1 \xd0\xc7".
    "\xca \xe3\xcd\xca\xe6\xec \xcd\xd1\xa1 \xca\xd4\xdb\xe1\xe5\xc7 \xe3\xc4".
    "\xd3\xd3\xc9 \xe6\xed\xdf\xed\xe3\xed\xcf\xed\xc7\xa1 \xc7\xe1\xca".
    "\xed \xe5\xed \xe3\xe4\xd9\xe3\xc9 \xdb\xed\xd1 \xd1\xc8\xcd\xed\xc9.";
  $want = getunicode(
    '\u0648\u064a\u0643\u064a\u0628\u064a\u062f\u064a\u0627 \u0647\u064a \u0645'.
    '\u0634\u0631\u0648\u0639 \u0645\u0648\u0633\u0648\u0639\u0629 \u0645\u062a'.
    '\u0639\u062f\u062f\u0629 \u0627\u0644\u0644\u063a\u0627\u062a\u060c \u0645'.
    '\u0628\u0646\u064a\u0629 \u0639\u0644\u0649 \u0627\u0644\u0648\u064a\u0628'.
    '\u060c \u0630\u0627\u062a \u0645\u062d\u062a\u0648\u0649 \u062d\u0631'.
    '\u060c \u062a\u0634\u063a\u0644\u0647\u0627 \u0645\u0624\u0633\u0633'.
    '\u0629 \u0648\u064a\u0643\u064a\u0645\u064a\u062f\u064a\u0627\u060c \u0627'.
    '\u0644\u062a\u064a \u0647\u064a \u0645\u0646\u0638\u0645\u0629 \u063a'.
    '\u064a\u0631 \u0631\u0628\u062d\u064a\u0629.');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);
}
/**
 * Detection/conversion cases for multi-byte input (Shift-JIS, per the test
 * name). Expected text is built from \u escapes via getunicode().
 */
function test_shift_jis() {
  // Literals are broken with "." wherever a hex escape is directly followed
  // by a character that could be read as another hex digit.
  $raw =
    "\x81w\x82\xc6\x82\xc8\x82\xe8\x82\xcc\x83g\x83g\x83\x8d\x81x\x82\xcd\x81".
    "A\x83X\x83^\x83W\x83I\x83W\x83u\x83\x8a\x90\xa7\x8d\xec\x82\xcc\x93\xfa".
    "\x96{\x82\xcc\x92\xb7\x95\xd2\x83". "A\x83j\x83\x81\x81[\x83V\x83\x87\x83".
    "\x93\x8d\xec\x95i\x81". "B";
  $want = getunicode(
    '\u300e\u3068\u306a\u308a\u306e\u30c8\u30c8\u30ed\u300f\u306f\u3001\u30b9'.
    '\u30bf\u30b8\u30aa\u30b8\u30d6\u30ea\u5236\u4f5c\u306e\u65e5\u672c\u306e'.
    '\u9577\u7de8\u30a2\u30cb\u30e1\u30fc\u30b7\u30e7\u30f3\u4f5c\u54c1\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\x83". "E\x83". "B\x83L\x83y\x83". "f\x83". "B\x83". "A (Wikipedia) \x82\xcd".
    "\x83". "E\x83". "B\x83L\x83\x81\x83". "f\x83". "B\x83". "A\x8d\xe0\x92".
    "c\x82\xaa\x89^\x89". "c\x82\xb7\x82\xe9\x83I\x83\x93\x83\x89\x83". "C\x83".
    "\x93\x95S\x89\xc8\x8e\x96\x93T\x81". "B";
  $want = getunicode(
    '\u30a6\u30a3\u30ad\u30da\u30c7\u30a3\u30a2 (Wikipedia) \u306f\u30a6'.
    '\u30a3\u30ad\u30e1\u30c7\u30a3\u30a2\u8ca1\u56e3\u304c\u904b\u55b6\u3059'.
    '\u308b\u30aa\u30f3\u30e9\u30a4\u30f3\u767e\u79d1\u4e8b\u5178\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  // Disabled case, kept as originally written: the input is too short and
  // the detector thinks it's most likely Windows-1252.
  //
  // VERIFY(detect_and_convert_to_utf8(
  // "\x90\xa2\x8a" "E\x90l\x8c\xa0\x90\xe9\x8c\xbe",
  // "\u4e16\u754c\u4eba\u6a29\u5ba3\u8a00") == true);
}
/**
 * Detection/conversion cases for multi-byte input (EUC-JP, per the test
 * name). Expected text is built from \u escapes via getunicode().
 */
function test_euc_jp() {
  $raw =
    "\xb2\xbf\xbf\xcd\xa4\xe2\xa1\xa2\xa4\xdb\xa4\xb7\xa4\xa4\xa4\xde\xa4\xde".
    "\xa4\xcb\xc2\xe1\xca\xe1\xa1\xa2\xb9\xb4\xb6\xd8\xa1\xa2\xcb\xf4\xa4\xcf".
    "\xc4\xc9\xca\xfc\xa4\xb5\xa4\xec\xa4\xeb\xa4\xb3\xa4\xc8\xa4\xcf\xa4\xca".
    "\xa4\xa4\xa1\xa3";
  $want = getunicode(
    '\u4f55\u4eba\u3082\u3001\u307b\u3057\u3044\u307e\u307e\u306b\u902e\u6355'.
    '\u3001\u62d8\u7981\u3001\u53c8\u306f\u8ffd\u653e\u3055\u308c\u308b\u3053'.
    '\u3068\u306f\u306a\u3044\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  // Mixed input: multi-byte sequences with an embedded ASCII run.
  $raw =
    "1920\xc7\xaf \xa5\xa6\xa5\xa3\xa5\xf3\xa5\xd6\xa5\xeb\xa5\xc9\xa5\xf3\xc1".
    "\xaa\xbc\xea\xb8\xa2\xa1\xcaThe Championships, Wimbledon 1920\xa1\xcb\xa4".
    "\xcb\xb4\xd8\xa4\xb9\xa4\xeb\xb5\xad\xbb\xf6\xa1\xa3";
  $want = getunicode(
    '1920\u5e74 \u30a6\u30a3\u30f3\u30d6\u30eb\u30c9\u30f3\u9078\u624b\u6a29'.
    '\uff08The Championships, Wimbledon 1920\uff09\u306b\u95a2\u3059\u308b'.
    '\u8a18\u4e8b\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);
}
/**
 * Detection/conversion cases for escape-sequence based input (ISO-2022-JP,
 * per the test name). "$" is written as "\$" inside the double-quoted
 * literals so that it is not treated as variable interpolation.
 */
function test_iso_2022_jp() {
  $raw =
    "\x1b\$B%-%M%F%#%C%/%3%M%/%7%g%s!J\x1b(Bkinetic connection\x1b\$B!K\$O\x1b(B1".
    "986\x1b\$BG/\$K%=%K!<\$,\x1b(BMSX2\x1b\$B\$GH/Gd\$7\$?%Q%:%k%2!<%`!#\x1b(B";
  $want = getunicode(
    '\u30ad\u30cd\u30c6\u30a3\u30c3\u30af\u30b3\u30cd\u30af\u30b7\u30e7\u30f3'.
    '\uff08kinetic connection\uff09\u306f1986\u5e74\u306b\u30bd\u30cb\u30fc'.
    '\u304cMSX2\u3067\u767a\u58f2\u3057\u305f\u30d1\u30ba\u30eb\u30b2\u30fc'.
    '\u30e0\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw = "\x1b\$B5f6K\x1b(B!!\x1b\$BJQBV2>LL\x1b(B";
  $want = getunicode("\u7a76\u6975!!\u5909\u614b\u4eee\u9762");
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);
}
/**
 * Detection/conversion cases for multi-byte input (GB2312, per the test
 * name). Expected text is built from \u escapes via getunicode().
 */
function test_gb2312() {
  $raw =
    "\xca\xc7\xd2\xbb\xb8\xf6\xd3\xef\xd1\xd4\xa1\xa2\xc4\xda\xc8\xdd\xbf\xaa".
    "\xb7\xc5\xb5\xc4\xcd\xf8\xc2\xe7\xb0\xd9\xbf\xc6\xc8\xab\xca\xe9\xbc\xc6".
    "\xbb\xae";
  $want = getunicode(
    '\u662f\u4e00\u4e2a\u8bed\u8a00\u3001\u5185\u5bb9\u5f00\u653e\u7684\u7f51'.
    '\u7edc\u767e\u79d1\u5168\u4e66\u8ba1\u5212');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  // Mixed input: multi-byte sequences with an embedded ASCII run.
  $raw =
    "\xa1\xb6\xd2\xbb\xc1\xa3\xd6\xd3\xd5\xe6\xc8\xcb\xcb\xd5\xa1\xb7\xa3\xa8".
    "\xd3\xa2\xce\xc4\xc3\xfb\xa3\xbaSo Real Time Cooking\xa3\xa9";
  $want = getunicode(
    '\u300a\u4e00\u7c92\u949f\u771f\u4eba\u82cf\u300b\uff08\u82f1\u6587\u540d'.
    '\uff1aSo Real Time Cooking\uff09');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  // Disabled case, kept as originally written: the input is too short and
  // the detector thinks it's most likely Shift-JIS.
  // VERIFY(detect_and_convert_to_utf8(
  // "\xce\xe2\xb8\xe7\xbf\xdf",
  // "\u5434\u54e5\u7a9f") == true);
}
/**
 * Detection/conversion cases for multi-byte input (Big5, per the test
 * name). Expected text is built from \u escapes via getunicode().
 */
function test_big5() {
  // "\xa1". "A" style splits keep the trailing letter out of the hex escape.
  $raw =
    "1\xa1]\xa4@\xa1^\xacO0\xbbP2\xa4\xa7\xb6\xa1\xaa\xba\xa6\xdb\xb5M\xbc\xc6".
    "\xa1". "A\xacO\xb3\xcc\xa4p\xaa\xba\xa5\xbf\xa9_\xbc\xc6\xa1". "C";
  $want = getunicode(
    '1\uff08\u4e00\uff09\u662f0\u82072\u4e4b\u9593\u7684\u81ea\u7136\u6578'.
    '\uff0c\u662f\u6700\u5c0f\u7684\u6b63\u5947\u6578\u3002');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\xbeG\xa5\xf2\xaf\xf4\xa1]\xad^\xa4\xe5\xa6W \xa1G Cheng Chung Yin\xa1^".
    "\xa1". "A\xacO\xa4@\xa6W\xa5x\xc6W\xa4k\xbat\xad\xfb";
  $want = getunicode(
    '\u912d\u4ef2\u8335\uff08\u82f1\u6587\u540d \uff1a Cheng Chung Yin\uff09'.
    '\uff0c\u662f\u4e00\u540d\u53f0\u7063\u5973\u6f14\u54e1');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  // Disabled case, kept as originally written: the input is too short and
  // the detector thinks it's most likely Shift-JIS.
  // VERIFY(detect_and_convert_to_utf8(
  // "\xa7" "d\xad\xf4\xb8]",
  // "\u5433\u54e5\u7a9f") == true);
}
/**
 * Detection/conversion cases for single-byte input (KOI8-R, per the test
 * name): three samples of decreasing length, each compared with the text
 * that getunicode() builds from \u escapes.
 */
function test_koi8r() {
  $raw =
    "\xeb\xd7\xc5\xc2\xc5\xcb \xd0\xc5\xd2\xd7\xc1\xd1 \xd0\xcf \xd0\xcc\xcf".
    "\xdd\xc1\xc4\xc9 \xc9 \xd7\xd4\xcf\xd2\xc1\xd1 \xd0\xcf \xce\xc1\xd3\xc5".
    "\xcc\xc5\xce\xc9\xc0 \xd0\xd2\xcf\xd7\xc9\xce\xc3\xc9\xd1 \xeb\xc1\xce".
    "\xc1\xc4\xd9.";
  $want = getunicode(
    '\u041a\u0432\u0435\u0431\u0435\u043a \u043f\u0435\u0440\u0432\u0430'.
    '\u044f \u043f\u043e \u043f\u043b\u043e\u0449\u0430\u0434\u0438 \u0438 '.
    '\u0432\u0442\u043e\u0440\u0430\u044f \u043f\u043e \u043d\u0430\u0441'.
    '\u0435\u043b\u0435\u043d\u0438\u044e \u043f\u0440\u043e\u0432\u0438'.
    '\u043d\u0446\u0438\u044f \u041a\u0430\u043d\u0430\u0434\u044b.');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\xe2\xdf\xd2\xc4\xc5\xce\xc9 \xc5 \xd3\xc5\xcc\xcf \xd7 \xf3\xc5\xd7\xc5".
    "\xd2\xce\xc1 \xe2\xdf\xcc\xc7\xc1\xd2\xc9\xd1";
  $want = getunicode(
    '\u0411\u044a\u0440\u0434\u0435\u043d\u0438 \u0435 \u0441\u0435\u043b'.
    '\u043e \u0432 \u0421\u0435\u0432\u0435\u0440\u043d\u0430 \u0411\u044a'.
    '\u043b\u0433\u0430\u0440\u0438\u044f');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\xfe\xc5\xd2\xd7\xc5\xce\xc1 \xd0\xd2\xc5\xd7\xdf\xda\xc8\xcf\xc4\xce".
    "\xc1";
  $want = getunicode(
    '\u0427\u0435\u0440\u0432\u0435\u043d\u0430 \u043f\u0440\u0435\u0432'.
    '\u044a\u0437\u0445\u043e\u0434\u043d\u0430');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);
}
/**
 * Detection/conversion cases for single-byte input (Windows-1251, per the
 * test name). Expected text is built from \u escapes via getunicode().
 */
function test_windows_1251() {
  $raw =
    "\xce\xf7\xe5 \xed\xe0\xf8, \xea\xee\xbc \xf1\xe8 \xed\xe0 \xed\xe5\xe1".
    "\xe5\xf1\xe0\xf2\xe0";
  $want = getunicode(
    '\u041e\u0447\u0435 \u043d\u0430\u0448, \u043a\u043e\u0458 \u0441\u0438 '.
    '\u043d\u0430 \u043d\u0435\u0431\u0435\u0441\u0430\u0442\u0430');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);

  $raw =
    "\xd1 \xe7\xee\xee\xeb\xee\xe3\xe8\xf7\xe5\xf1\xea\xee\xe9 \xf2\xee\xf7\xea".
    "\xe8 \xe7\xf0\xe5\xed\xe8\xff, \xe4\xee\xec\xe0\xf8\xed\xff\xff \xea\xee".
    "\xf8\xea\xe0 \x97 \xec\xeb\xe5\xea\xee\xef\xe8\xf2\xe0\xfe\xf9\xe5\xe5".
    " \xf1\xe5\xec\xe5\xe9\xf1\xf2\xe2\xe0 \xea\xee\xf8\xe0\xf7\xfc\xe8\xf5 ".
    "\xee\xf2\xf0\xff\xe4\xe0 \xf5\xe8\xf9\xed\xfb\xf5.";
  $want = getunicode(
    '\u0421 \u0437\u043e\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a'.
    '\u043e\u0439 \u0442\u043e\u0447\u043a\u0438 \u0437\u0440\u0435\u043d\u0438'.
    '\u044f, \u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043a\u043e'.
    '\u0448\u043a\u0430 \u2014 \u043c\u043b\u0435\u043a\u043e\u043f\u0438\u0442'.
    '\u0430\u044e\u0449\u0435\u0435 \u0441\u0435\u043c\u0435\u0439\u0441\u0442'.
    '\u0432\u0430 \u043a\u043e\u0448\u0430\u0447\u044c\u0438\u0445 \u043e\u0442'.
    '\u0440\u044f\u0434\u0430 \u0445\u0438\u0449\u043d\u044b\u0445.');
  VERIFY(detect_and_convert_to_utf8($raw, $want) == true);
}
/**
 * Cases where both the input and the expected text come from getunicode(),
 * i.e. the detector is handed the same text it is expected to return.
 */
function test_utf8() {
  $input = getunicode(
    "\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0");
  $want = getunicode(
    '\u10e8\u10d8\u10dc\u10d0\u10e3\u10e0\u10d8 \u10d9\u10d0\u10e2\u10d0');
  VERIFY(detect_and_convert_to_utf8($input, $want) == true);

  $input = getunicode('\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01');
  $want = getunicode('\u0e2b\u0e19\u0e49\u0e32\u0e2b\u0e25\u0e31\u0e01');
  VERIFY(detect_and_convert_to_utf8($input, $want) == true);
}
/**
 * UTF-16 input with a leading byte-order mark.
 *
 * The detector only handles UTF-16 if there's a BOM at the front, so the
 * sample starts with "\xff\xfe" and the expected text starts with \ufeff.
 */
function test_utf16() {
  $bom_prefixed =
    "\xff\xfeH\x00". "e\x00l\x00l\x00o\x00,\x00 \x00w\x00o\x00r\x00l\x00".
    "d\x00!\x00";
  $want = getunicode("\ufeffHello, world!");
  VERIFY(detect_and_convert_to_utf8($bom_prefixed, $want) == true);
}
// Run every encoding-detector test, in the order the expected output
// was recorded.
$ucsdet_cases = array(
  'test_basics',
  'test_cannot_detect',
  'test_declared_encoding',
  'test_hello_world',
  'test_windows_1252',
  'test_windows_1250',
  'test_windows_1256',
  'test_shift_jis',
  'test_euc_jp',
  'test_iso_2022_jp',
  'test_gb2312',
  'test_big5',
  'test_koi8r',
  'test_windows_1251',
  'test_utf8',
  'test_utf16',
);
foreach ($ucsdet_cases as $ucsdet_case) {
  $ucsdet_case();
}
+34
Ver Arquivo
@@ -0,0 +1,34 @@
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
+187
Ver Arquivo
@@ -0,0 +1,187 @@
<?php
/**
 * Strict-equality assertion helper ("verify same").
 *
 * Always prints the outcome of `$x === $y` via var_dump(), which is what the
 * expected-output file is compared against. On a mismatch it additionally
 * prints the expected and actual values and a backtrace to locate the
 * failing call.
 *
 * @param mixed $x Actual value.
 * @param mixed $y Expected value.
 */
function VS($x, $y) {
  var_dump($x === $y);
  if ($x !== $y) {
    // var_export() keeps booleans and null readable; plain string
    // interpolation would render false/null as an empty string and true
    // as "1", which hides exactly the values these tests compare.
    echo "Failed: " . var_export($y, true) . "\n";
    echo "Got: " . var_export($x, true) . "\n";
    var_dump(debug_backtrace());
  }
}
/**
 * Asserts that $x is strictly `true` by delegating to VS().
 */
function VERIFY($x) {
  VS($x, true);
}
//////////////////////////////////////////////////////////////////////
// PHP doesn't support \u escapes in string literals, so the text is
// wrapped in double quotes and decoded as a JSON string instead.
function u($x) {
  return json_decode('"' . $x . '"');
}
/**
 * Checks SpoofChecker::issuspicious() with a default-configured checker
 * against a table of (input, expected verdict) pairs.
 */
function test_SpoofChecker_issuspicious() {
  $checker = new SpoofChecker();

  $cases = array(
    array("facebook", false),
    // facebook with Cyrillic spoof characters
    array(u('f\u0430\u0441\u0435b\u043e\u043ek'), true),
    // "Russia" in Cyrillic with Latin spoof characters
    array(u('Pocc\u0438\u044f'), true),
    // paypal with Cyrillic spoof characters
    array(u('http://www.payp\u0430l.com'), true),
    // certain all-uppercase Latin sequences can be spoof of Greek
    array('NAPKIN PEZ', true),
    array('napkin pez', false),
    // English with Japanese characters
    array(u('True fact: \u5fcd\u8005 are mammals'), false),
    // Japanese name with mixed kanji and hiragana
    array(u('\u6a4b\u672c\u611b\u307f'), false),
  );

  foreach ($cases as $case) {
    list($text, $suspicious) = $case;
    VS($checker->issuspicious($text), $suspicious);
  }

  // Disabled: invalid UTF-8 input.
  // try {
  // $checker->issuspicious("this is not UTF-8: \x87\xFB\xCA\x94\xDB");
  // } catch (Exception $e) {
  // VS(true, true);
  // }
}
/**
 * Checks SpoofChecker::areconfusable() with a default-configured checker
 * against a table of (first, second, expected verdict) triples.
 */
function test_SpoofChecker_areconfusable() {
  $checker = new SpoofChecker();

  $cases = array(
    array("hello, world", "goodbye, world", false),
    array("hello, world", "hello, world", true),
    array("hello, world", "he11o, wor1d", true),
    array(u('hell\u00f8'), u('hello\u0337'), true),
    array("facebook", u('f\u0430\u0441\u0435b\u043e\u043ek'), true),
    array("facebook", "\xf0\x9d\x90\x9faceboo".u('\u1d0b'), true),
    array("facebook", u('\u017facebook'), true),
    array("paypal", u('payp\u0430l'), true),
    array(
      "NAPKIN PEZ",
      u('\u039d\u0391\u03a1\u039a\u0399\u039d \u03a1\u0395\u0396'),
      true,
    ),
    array(
      "facebook",
      u('ufiek-a\u048ba\u049d \u049da\u048b\u00f0a\u048b\u01e5a\u048b-\u049dota-'.
        '\u00f0o\u00f0ol'),
      false,
    ),
  );

  foreach ($cases as $case) {
    list($first, $second, $confusable) = $case;
    VS($checker->areconfusable($first, $second), $confusable);
  }

  // Disabled: invalid UTF-8 input.
  // try {
  // $checker->areconfusable(
  // "this is not UTF-8: \x87\xFB\xCA\x94\xDB",
  // "so there.");
  // } catch (Exception $e) {
  // VS(true, true);
  // }
}
/**
 * Checks the optional by-reference "issues found" output argument of
 * SpoofChecker::issuspicious() and SpoofChecker::areconfusable().
 *
 * After each call, $ret is compared with the SpoofChecker constant naming
 * the kind of confusability expected for that input.
 */
function test_SpoofChecker_issuesfound() {
  $checker = new SpoofChecker();

  VS($checker->issuspicious("NAPKIN PEZ", $ret), true);
  VS($ret, SpoofChecker::WHOLE_SCRIPT_CONFUSABLE);

  VS($checker->issuspicious(u('f\u0430\u0441\u0435b\u043e\u043ek'), $ret),
     true);
  VS($ret, SpoofChecker::MIXED_SCRIPT_CONFUSABLE);

  VS($checker->areconfusable("hello, world", "he11o, wor1d", $ret), true);
  VS($ret, SpoofChecker::SINGLE_SCRIPT_CONFUSABLE);

  // No return value: the former `return Count(true);` was a leftover from
  // the C++ test harness. In PHP it invoked count() on a boolean, which
  // raises a warning or TypeError on newer runtimes, and no caller used it.
}
/**
 * Checks that SpoofChecker::setchecks() changes the verdicts of
 * areconfusable() and issuspicious().
 */
function test_SpoofChecker_setchecks() {
  $upper_spoof = u('H\u0415LLO');
  $lower_spoof = u('h\u0435llo');
  $mixed_script_text = u('True fact: \u5fcd\u8005 are mammals');

  $checker = new SpoofChecker();

  // The checker should start in any-case mode.
  VS($checker->areconfusable("HELLO", $upper_spoof), true);
  VS($checker->areconfusable("hello", $lower_spoof), true);

  // Go to lower-case only mode (assumes all strings have been
  // case-folded).
  $lower_case_only =
    SpoofChecker::MIXED_SCRIPT_CONFUSABLE |
    SpoofChecker::WHOLE_SCRIPT_CONFUSABLE |
    SpoofChecker::SINGLE_SCRIPT_CONFUSABLE;
  $checker->setchecks($lower_case_only);
  VS($checker->areconfusable("HELLO", $upper_spoof), false);
  VS($checker->areconfusable("hello", $lower_spoof), true);

  // Start over with a fresh checker for the single-script check.
  $checker = new SpoofChecker();
  VS($checker->issuspicious($mixed_script_text), false);

  // Only allow characters of a single script.
  $checker->setchecks(SpoofChecker::SINGLE_SCRIPT);
  VS($checker->issuspicious($mixed_script_text), true);

  // Disabled: invalid check flags.
  // try {
  // $checker = new SpoofChecker();
  // $checker->setchecks(0xDEADBEEF);
  // } catch (Exception $e) {
  // VS(true, true);
  // }
}
/**
 * Checks SpoofChecker::setallowedlocales(): for each allowed-locale list,
 * the same six sample strings are run through issuspicious() and compared
 * with the expected verdicts.
 */
function test_SpoofChecker_setallowedlocales() {
  $checker = new SpoofChecker();

  // Sample strings, listed in the order they are checked.
  $samples = array(
    // common
    "Rogers",
    // japanese_kanji_hiragana
    u('\u6a4b\u672c\u611b\u307f'),
    // russian_cyrillic
    u('\u0417\u0438\u0301\u043c\u043d\u0438\u0439 '.
      '\u0432\u0435\u0301\u0447\u0435\u0440'),
    // arabic
    u('\u0645\u0631\u062d\u0628\u064b\u0627'),
    // korean
    u('\ud55c\uad6d\ub9d0'),
    // snowman
    u('\u2603'),
  );

  // Allowed-locale list => expected issuspicious() verdict per sample:
  //                        common japanese russian arabic korean snowman
  $expectations = array(
    "en_US"        => array(false, true,  true,  true,  true,  false),
    "en_US, ja_JP" => array(false, false, true,  true,  true,  false),
    "en_US, ko_KR" => array(false, true,  true,  true,  false, false),
    "en_US, ar_AR" => array(false, true,  true,  false, true,  false),
    "en_US, ru_RU" => array(false, true,  false, true,  true,  false),
  );

  foreach ($expectations as $locales => $verdicts) {
    $checker->setallowedlocales($locales);
    foreach ($samples as $idx => $text) {
      VS($checker->issuspicious($text), $verdicts[$idx]);
    }
  }
}
// Run every SpoofChecker test, in the order the expected output was
// recorded.
$uspoof_cases = array(
  'test_SpoofChecker_issuspicious',
  'test_SpoofChecker_areconfusable',
  'test_SpoofChecker_issuesfound',
  'test_SpoofChecker_setchecks',
  'test_SpoofChecker_setallowedlocales',
);
foreach ($uspoof_cases as $uspoof_case) {
  $uspoof_case();
}
+60
Ver Arquivo
@@ -0,0 +1,60 @@
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)