From 99ce60eda7ab64f3c089dfe5e8dd01225eab5243 Mon Sep 17 00:00:00 2001 From: Jordan DeLong Date: Fri, 14 Jun 2013 23:32:15 -0700 Subject: [PATCH] Port TestExtIcu to php --- hphp/test/ext/test_ext.h | 1 - hphp/test/ext/test_ext_icu.cpp | 266 ---------------------- hphp/test/ext/test_ext_icu.h | 37 --- hphp/test/slow/ext_icu/ext_icu.php | 239 +++++++++++++++++++ hphp/test/slow/ext_icu/ext_icu.php.expect | 33 +++ 5 files changed, 272 insertions(+), 304 deletions(-) delete mode 100644 hphp/test/ext/test_ext_icu.cpp delete mode 100644 hphp/test/ext/test_ext_icu.h create mode 100644 hphp/test/slow/ext_icu/ext_icu.php create mode 100644 hphp/test/slow/ext_icu/ext_icu.php.expect diff --git a/hphp/test/ext/test_ext.h b/hphp/test/ext/test_ext.h index 42b5ae486..0e46079b0 100644 --- a/hphp/test/ext/test_ext.h +++ b/hphp/test/ext/test_ext.h @@ -36,7 +36,6 @@ #include "hphp/facebook/extensions/urlextraction/test_ext_urlextraction.h" #include "hphp/test/ext/test_ext_curl.h" #include "hphp/test/ext/test_ext_file.h" -#include "hphp/test/ext/test_ext_icu.h" #include "hphp/test/ext/test_ext_icu_ucnv.h" #include "hphp/test/ext/test_ext_icu_ucsdet.h" #include "hphp/test/ext/test_ext_icu_uspoof.h" diff --git a/hphp/test/ext/test_ext_icu.cpp b/hphp/test/ext/test_ext_icu.cpp deleted file mode 100644 index 2f95f50ca..000000000 --- a/hphp/test/ext/test_ext_icu.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#include "hphp/test/ext/test_ext_icu.h" -#include "hphp/runtime/ext/ext_icu.h" -#include - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu::RunTests(const std::string &which) { - bool ret = true; - - RUN_TEST(test_icu_match); - RUN_TEST(test_icu_transliterate); - RUN_TEST(test_icu_tokenize); - - return ret; -} - -/////////////////////////////////////////////////////////////////////////////// - -bool TestExtIcu::test_icu_match() { - // Test subject strings. - String subject = String( - "\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3", - CopyString); - String subject_32 = String( - "\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905", - CopyString); - String subject_en = String("this is an english string", CopyString); - // "this is a hebrew string" - String subject_he = String( - "\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea " - "\u05e2\u05d1\u05e8\u05d9\u05ea", - CopyString); - // "this is an arabic string" - String subject_ar = String( - "\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb " - "\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3", - CopyString); - // "this is a hebrew string" - String subject_mixed = String( - "this is a \u05e2\u05d1\u05e8\u05d9\u05ea string", - CopyString); - - // Test basic regex parsing functionality. - VERIFY(f_icu_match("scripting", subject)); - VERIFY(!f_icu_match("php", subject)); - VERIFY(f_icu_match("(\\bPHP\\b)", subject)); - VERIFY(!f_icu_match("(\\bPHP\\b))", subject)); - - // Test returning matches functionality. - Variant matches; - VERIFY(f_icu_match("(PHP) is", subject, ref(matches))); - VS(f_print_r(matches, true), - "Array\n" - "(\n" - " [0] => PHP is\n" - " [1] => PHP\n" - ")\n"); - VERIFY(f_icu_match("is (a)", subject, ref(matches), - k_UREGEX_OFFSET_CAPTURE)); - VS(f_print_r(matches, true), - "Array\n" - "(\n" - " [0] => Array\n" - " (\n" - " [0] => is a\n" - " [1] => 7\n" - " )\n" - "\n" - " [1] => Array\n" - " (\n" - " [0] => a\n" - " [1] => 10\n" - " )\n" - "\n" - ")\n"); - VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches), - k_UREGEX_OFFSET_CAPTURE)); - VS(f_print_r(matches, true), - "Array\n" - "(\n" - " [0] => Array\n" - " (\n" - " [0] => . \ufeb0\n" - " [1] => 30\n" - " )\n" - "\n" - ")\n"); - VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)", - subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE)); - VS(f_print_r(matches, true), - "Array\n" - "(\n" - " [0] => Array\n" - " (\n" - " [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n" - " [1] => 4\n" - " )\n" - "\n" - " [1] => Array\n" - " (\n" - " [0] => \ufe8e\ufee0\ufee8\ufebb\n" - " [1] => 7\n" - " )\n" - "\n" - ")\n"); - - // Test match for 32-bit code points. - VERIFY(f_icu_match(".*", subject_32, ref(matches))); - VS(f_print_r(matches, true), - "Array\n" - "(\n" - " [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n" - ")\n"); - - // Test regex caching functionality. - VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE)); - VERIFY(!f_icu_match("(php)", subject)); - - // Test ICU specific (ie bidi) functionality. - String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString); - String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString); - String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString); - - VERIFY(f_icu_match(pattern_ltr, subject_en)); - VERIFY(!f_icu_match(pattern_rtl, subject_en)); - - VERIFY(!f_icu_match(pattern_ltr, subject_he)); - VERIFY(f_icu_match(pattern_rtl, subject_he)); - VERIFY(!f_icu_match(pattern_arl, subject_he)); - - VERIFY(!f_icu_match(pattern_ltr, subject_ar)); - VERIFY(!f_icu_match(pattern_rtl, subject_ar)); - VERIFY(f_icu_match(pattern_arl, subject_ar)); - - VERIFY(f_icu_match(pattern_ltr, subject_mixed)); - VERIFY(f_icu_match(pattern_rtl, subject_mixed)); - - return Count(true); -} - -// Test string lifted from tests/intl/utf8.h -bool TestExtIcu::test_icu_transliterate() { - String input_ru = - String("\xd1\x84\xd0\xb5\xd0\xb9\xd1" - "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba", - CopyString); - String output_ru = f_icu_transliterate(input_ru, false); - // Note: different than php test ('y' -> 'j') - VERIFY(output_ru == "fejsbu\xc5\x93k"); - - // Verify that removing accents works. - String input_de = String("Ich m\xc3\xb6" - "chte \xc3\xbc" - "berzeugend " - "oder \xc3\xa4hnliche sein", - CopyString); - String output_de = f_icu_transliterate(input_de, true); - VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein"); - - // Verify that keeping accents works. - VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str()); - - // Check an non-Latin language. - String input_zh = String("\xe5\x9b\x9b" - "\xe5\x8d\x81\xe5\x9b\x9b\xe7" - "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90", - CopyString); - String output_zh = f_icu_transliterate(input_zh, true); - VERIFY(output_zh == "si shi si shi shi zi"); - - return Count(true); -} - - -bool TestExtIcu::test_icu_tokenize() { - - - String input_eng = String("Hello World"); - Array output_eng = f_icu_tokenize(input_eng); - - VS(f_print_r(output_eng, true), - "Array\n" - "(\n" - " [0] => _B_\n" - " [1] => hello\n" - " [2] => world\n" - " [3] => _E_\n" - ")\n" - ); - String input_long = String("Hello! You are visitor #1234 to " - "http://www.facebook.com! " - "<3 How are you today (6/14/2011)," - " hello@world.com?"); - - Array output_long = f_icu_tokenize(input_long); - - VS(f_print_r(output_long, true), - "Array\n" - "(\n" - " [0] => _B_\n" - " [1] => hello\n" - " [2] => !\n" - " [3] => you\n" - " [4] => are\n" - " [5] => visitor\n" - " [6] => #\n" - " [7] => XXXX\n" - " [8] => to\n" - " [9] => TOKEN_URL\n" - " [10] => !\n" - " [11] => TOKEN_HEART\n" - " [12] => how\n" - " [13] => are\n" - " [14] => you\n" - " [15] => today\n" - " [16] => (\n" - " [17] => TOKEN_DATE\n" - " [18] => )\n" - " [19] => ,\n" - " [20] => TOKEN_EMAIL\n" - " [21] => ?\n" - " [22] => _E_\n" - ")\n" - ); - - String input_de = String("Ich möchte überzeugend oder ähnliche sein"); - Array output_de = f_icu_tokenize(input_de); - - VS(f_print_r(output_de, true), - "Array\n" - "(\n" - " [0] => _B_\n" - " [1] => ich\n" - " [2] => mã\n" - " [3] => ¶\n" - " [4] => chte\n" - " [5] => ã\n" - " [6] => ¼\n" - " [7] => berzeugend\n" - " [8] => oder\n" - " [9] => ã\n" - " [10] => ¤\n" - " [11] => hnliche\n" - " [12] => sein\n" - " [13] => _E_\n" - ")\n"); - - - return Count(true); -} diff --git a/hphp/test/ext/test_ext_icu.h b/hphp/test/ext/test_ext_icu.h deleted file mode 100644 index ad8e5fefc..000000000 --- a/hphp/test/ext/test_ext_icu.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ -*/ - -#ifndef incl_HPHP_TEST_EXT_ICU_H_ -#define incl_HPHP_TEST_EXT_ICU_H_ - -// >>>>>> Generated by idl.php. Do NOT modify. <<<<<< - -#include "hphp/test/ext/test_cpp_ext.h" - -/////////////////////////////////////////////////////////////////////////////// - -class TestExtIcu : public TestCppExt { - public: - virtual bool RunTests(const std::string &which); - - bool test_icu_match(); - bool test_icu_transliterate(); - bool test_icu_tokenize(); -}; - -/////////////////////////////////////////////////////////////////////////////// - -#endif // incl_HPHP_TEST_EXT_ICU_H_ diff --git a/hphp/test/slow/ext_icu/ext_icu.php b/hphp/test/slow/ext_icu/ext_icu.php new file mode 100644 index 000000000..7fc195f75 --- /dev/null +++ b/hphp/test/slow/ext_icu/ext_icu.php @@ -0,0 +1,239 @@ + PHP is\n". + " [1] => PHP\n". + ")\n"); + VERIFY(icu_match("is (a)", $subject, $matches, + UREGEX_OFFSET_CAPTURE) != false); + VS(print_r($matches, true), + "Array\n". + "(\n". + " [0] => Array\n". + " (\n". + " [0] => is a\n". + " [1] => 7\n". + " )\n". + "\n". + " [1] => Array\n". + " (\n". + " [0] => a\n". + " [1] => 10\n". + " )\n". + "\n". + ")\n"); + VERIFY(icu_match("\\. \xef\xba\xb0", $subject, $matches, + UREGEX_OFFSET_CAPTURE) != false); + VS(print_r($matches, true), + "Array\n". + "(\n". + " [0] => Array\n". + " (\n". + " [0] => . \xef\xba\xb0\n". + " [1] => 30\n". + " )\n". + "\n". + ")\n"); + $junk1="\xef\xbb\xa9\xef\xbb\xad"; + $junk2="\xef\xba\x8e\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb"; + VERIFY(icu_match("$junk1 ($junk2)", + $subject_ar, $matches, UREGEX_OFFSET_CAPTURE) != false); + VS(print_r($matches, true), + "Array\n". + "(\n". + " [0] => Array\n". + " (\n". + " [0] => $junk1 $junk2\n". + " [1] => 4\n". + " )\n". + "\n". + " [1] => Array\n". + " (\n". + " [0] => $junk2\n". + " [1] => 7\n". + " )\n". + "\n". + ")\n"); + + // Test match for 32-bit code points. + VERIFY(icu_match(".*", $subject_32, $matches) != false); + $expected="\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4". + "\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85"; + VS(print_r($matches, true), + "Array\n". + "(\n". + " [0] => $expected\n". + ")\n"); + + // Test regex caching functionality. + VERIFY(icu_match("(php)", $subject, $ignore, UREGEX_CASE_INSENSITIVE) != false); + VERIFY(icu_match("(php)", $subject) == false); + + // Test ICU specific (ie bidi) functionality. + $pattern_ltr = "\\p{Bidi_Class=Left_To_Right}"; + $pattern_rtl = "\\p{Bidi_Class=Right_To_Left}"; + $pattern_arl = "\\p{Bidi_Class=Arabic_Letter}"; + + VERIFY(icu_match($pattern_ltr, $subject_en) != false); + VERIFY(icu_match($pattern_rtl, $subject_en) == false); + + VERIFY(icu_match($pattern_ltr, $subject_he) == false); + VERIFY(icu_match($pattern_rtl, $subject_he) != false); + VERIFY(icu_match($pattern_arl, $subject_he) == false); + + VERIFY(icu_match($pattern_ltr, $subject_ar) == false); + VERIFY(icu_match($pattern_rtl, $subject_ar) == false); + VERIFY(icu_match($pattern_arl, $subject_ar) != false); + + VERIFY(icu_match($pattern_ltr, $subject_mixed) != false); + VERIFY(icu_match($pattern_rtl, $subject_mixed) != false); +} + +// Test string lifted from tests/intl/utf8.h +function test_icu_transliterate() { + $input_ru = "\xd1\x84\xd0\xb5\xd0\xb9\xd1". + "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba"; + $output_ru = icu_transliterate($input_ru, false); + // Note: different than php test ('y' -> 'j') + VERIFY($output_ru == "fejsbu\xc5\x93k"); + + // Verify that removing accents works. + $input_de = "Ich m\xc3\xb6". + "chte \xc3\xbc". + "berzeugend ". + "oder \xc3\xa4hnliche sein"; + $output_de = icu_transliterate($input_de, true); + VERIFY($output_de == "Ich mochte uberzeugend oder ahnliche sein"); + + // Verify that keeping accents works. + VERIFY(icu_transliterate($input_de, false) == $input_de); + + // Check a non-Latin language. + $input_zh = "\xe5\x9b\x9b". + "\xe5\x8d\x81\xe5\x9b\x9b\xe7". + "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90"; + $output_zh = icu_transliterate($input_zh, true); + VERIFY($output_zh == "si shi si shi shi zi"); +} + + +function test_icu_tokenize() { + $input_eng = "Hello World"; + $output_eng = icu_tokenize($input_eng); + + VS(print_r($output_eng, true), + "Array\n". + "(\n". + " [0] => _B_\n". + " [1] => hello\n". + " [2] => world\n". + " [3] => _E_\n". + ")\n" + ); + $input_long = "Hello! You are visitor #1234 to ". + "http://www.facebook.com! ". + "<3 How are you today (6/14/2011),". + " hello@world.com?"; + + $output_long = icu_tokenize($input_long); + + VS(print_r($output_long, true), + "Array\n". + "(\n". + " [0] => _B_\n". + " [1] => hello\n". + " [2] => !\n". + " [3] => you\n". + " [4] => are\n". + " [5] => visitor\n". + " [6] => #\n". + " [7] => XXXX\n". + " [8] => to\n". + " [9] => TOKEN_URL\n". + " [10] => !\n". + " [11] => TOKEN_HEART\n". + " [12] => how\n". + " [13] => are\n". + " [14] => you\n". + " [15] => today\n". + " [16] => (\n". + " [17] => TOKEN_DATE\n". + " [18] => )\n". + " [19] => ,\n". + " [20] => TOKEN_EMAIL\n". + " [21] => ?\n". + " [22] => _E_\n". + ")\n" + ); + + $input_de = "Ich möchte überzeugend oder ähnliche sein"; + $output_de = icu_tokenize($input_de); + + VS(print_r($output_de, true), + "Array\n". + "(\n". + " [0] => _B_\n". + " [1] => ich\n". + " [2] => mã\n". + " [3] => ¶\n". + " [4] => chte\n". + " [5] => ã\n". + " [6] => ¼\n". + " [7] => berzeugend\n". + " [8] => oder\n". + " [9] => ã\n". + " [10] => ¤\n". + " [11] => hnliche\n". + " [12] => sein\n". + " [13] => _E_\n". + ")\n"); +} + +test_icu_match(); +test_icu_transliterate(); +test_icu_tokenize(); diff --git a/hphp/test/slow/ext_icu/ext_icu.php.expect b/hphp/test/slow/ext_icu/ext_icu.php.expect new file mode 100644 index 000000000..e7038bc35 --- /dev/null +++ b/hphp/test/slow/ext_icu/ext_icu.php.expect @@ -0,0 +1,33 @@ +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true)