Arquivos
hhvm/hphp/test/test_ext_icu.cpp
T
Edwin Smith f29ee5314d Remove String::operator const char*().
Too many ways to shoot self in foot with this gem.
2013-04-25 11:34:21 -07:00

267 linhas
8.1 KiB
C++

/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include <test/test_ext_icu.h>
#include <runtime/ext/ext_icu.h>
#include <iostream>
///////////////////////////////////////////////////////////////////////////////
bool TestExtIcu::RunTests(const std::string &which) {
bool ret = true;
RUN_TEST(test_icu_match);
RUN_TEST(test_icu_transliterate);
RUN_TEST(test_icu_tokenize);
return ret;
}
///////////////////////////////////////////////////////////////////////////////
bool TestExtIcu::test_icu_match() {
// Test subject strings.
String subject = String(
"\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
CopyString);
String subject_32 = String(
"\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
CopyString);
String subject_en = String("this is an english string", CopyString);
// "this is a hebrew string"
String subject_he = String(
"\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
"\u05e2\u05d1\u05e8\u05d9\u05ea",
CopyString);
// "this is an arabic string"
String subject_ar = String(
"\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
"\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
CopyString);
// "this is a hebrew string"
String subject_mixed = String(
"this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
CopyString);
// Test basic regex parsing functionality.
VERIFY(f_icu_match("scripting", subject));
VERIFY(!f_icu_match("php", subject));
VERIFY(f_icu_match("(\\bPHP\\b)", subject));
VERIFY(!f_icu_match("(\\bPHP\\b))", subject));
// Test returning matches functionality.
Variant matches;
VERIFY(f_icu_match("(PHP) is", subject, ref(matches)));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => PHP is\n"
" [1] => PHP\n"
")\n");
VERIFY(f_icu_match("is (a)", subject, ref(matches),
k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => is a\n"
" [1] => 7\n"
" )\n"
"\n"
" [1] => Array\n"
" (\n"
" [0] => a\n"
" [1] => 10\n"
" )\n"
"\n"
")\n");
VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches),
k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => . \ufeb0\n"
" [1] => 30\n"
" )\n"
"\n"
")\n");
VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
" [1] => 4\n"
" )\n"
"\n"
" [1] => Array\n"
" (\n"
" [0] => \ufe8e\ufee0\ufee8\ufebb\n"
" [1] => 7\n"
" )\n"
"\n"
")\n");
// Test match for 32-bit code points.
VERIFY(f_icu_match(".*", subject_32, ref(matches)));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
")\n");
// Test regex caching functionality.
VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE));
VERIFY(!f_icu_match("(php)", subject));
// Test ICU specific (ie bidi) functionality.
String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString);
String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString);
String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString);
VERIFY(f_icu_match(pattern_ltr, subject_en));
VERIFY(!f_icu_match(pattern_rtl, subject_en));
VERIFY(!f_icu_match(pattern_ltr, subject_he));
VERIFY(f_icu_match(pattern_rtl, subject_he));
VERIFY(!f_icu_match(pattern_arl, subject_he));
VERIFY(!f_icu_match(pattern_ltr, subject_ar));
VERIFY(!f_icu_match(pattern_rtl, subject_ar));
VERIFY(f_icu_match(pattern_arl, subject_ar));
VERIFY(f_icu_match(pattern_ltr, subject_mixed));
VERIFY(f_icu_match(pattern_rtl, subject_mixed));
return Count(true);
}
// Test string lifted from tests/intl/utf8.h
bool TestExtIcu::test_icu_transliterate() {
String input_ru =
String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
"\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
CopyString);
String output_ru = f_icu_transliterate(input_ru, false);
// Note: different than php test ('y' -> 'j')
VERIFY(output_ru == "fejsbu\xc5\x93k");
// Verify that removing accents works.
String input_de = String("Ich m\xc3\xb6"
"chte \xc3\xbc"
"berzeugend "
"oder \xc3\xa4hnliche sein",
CopyString);
String output_de = f_icu_transliterate(input_de, true);
VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein");
// Verify that keeping accents works.
VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str());
// Check an non-Latin language.
String input_zh = String("\xe5\x9b\x9b"
"\xe5\x8d\x81\xe5\x9b\x9b\xe7"
"\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
CopyString);
String output_zh = f_icu_transliterate(input_zh, true);
VERIFY(output_zh == "si shi si shi shi zi");
return Count(true);
}
bool TestExtIcu::test_icu_tokenize() {
String input_eng = String("Hello World");
Array output_eng = f_icu_tokenize(input_eng);
VS(f_print_r(output_eng, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => hello\n"
" [2] => world\n"
" [3] => _E_\n"
")\n"
);
String input_long = String("Hello! You are visitor #1234 to "
"http://www.facebook.com! "
"<3 How are you today (6/14/2011),"
" hello@world.com?");
Array output_long = f_icu_tokenize(input_long);
VS(f_print_r(output_long, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => hello\n"
" [2] => !\n"
" [3] => you\n"
" [4] => are\n"
" [5] => visitor\n"
" [6] => #\n"
" [7] => XXXX\n"
" [8] => to\n"
" [9] => TOKEN_URL\n"
" [10] => !\n"
" [11] => TOKEN_HEART\n"
" [12] => how\n"
" [13] => are\n"
" [14] => you\n"
" [15] => today\n"
" [16] => (\n"
" [17] => TOKEN_DATE\n"
" [18] => )\n"
" [19] => ,\n"
" [20] => TOKEN_EMAIL\n"
" [21] => ?\n"
" [22] => _E_\n"
")\n"
);
String input_de = String("Ich möchte überzeugend oder ähnliche sein");
Array output_de = f_icu_tokenize(input_de);
VS(f_print_r(output_de, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => ich\n"
" [2] => mã\n"
" [3] => ¶\n"
" [4] => chte\n"
" [5] => ã\n"
" [6] => ¼\n"
" [7] => berzeugend\n"
" [8] => oder\n"
" [9] => ã\n"
" [10] => ¤\n"
" [11] => hnliche\n"
" [12] => sein\n"
" [13] => _E_\n"
")\n");
return Count(true);
}