Port TestExtIcu to php

Esse commit está contido em:
Jordan DeLong
2013-06-14 23:32:15 -07:00
commit de Sara Golemon
commit 99ce60eda7
5 arquivos alterados com 272 adições e 304 exclusões
-1
Ver Arquivo
@@ -36,7 +36,6 @@
#include "hphp/facebook/extensions/urlextraction/test_ext_urlextraction.h"
#include "hphp/test/ext/test_ext_curl.h"
#include "hphp/test/ext/test_ext_file.h"
#include "hphp/test/ext/test_ext_icu.h"
#include "hphp/test/ext/test_ext_icu_ucnv.h"
#include "hphp/test/ext/test_ext_icu_ucsdet.h"
#include "hphp/test/ext/test_ext_icu_uspoof.h"
-266
Ver Arquivo
@@ -1,266 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "hphp/test/ext/test_ext_icu.h"
#include "hphp/runtime/ext/ext_icu.h"
#include <iostream>
///////////////////////////////////////////////////////////////////////////////
// Runs the three ICU extension test groups in a fixed order and returns
// `ret`, which starts out as true.
// NOTE(review): RUN_TEST is a macro defined outside this file; it presumably
// uses `which` to select tests and clears `ret` on failure -- confirm against
// the macro definition before renaming either local.
bool TestExtIcu::RunTests(const std::string &which) {
bool ret = true;
RUN_TEST(test_icu_match);
RUN_TEST(test_icu_transliterate);
RUN_TEST(test_icu_tokenize);
return ret;
}
///////////////////////////////////////////////////////////////////////////////
// Exercises f_icu_match: plain matching, capture groups, offset capture,
// supplementary-plane code points, flag handling and Bidi_Class properties.
// VERIFY / VS / Count are helpers declared outside this file; presumably
// VERIFY checks truthiness and VS checks equality -- confirm in the harness.
bool TestExtIcu::test_icu_match() {
// Test subject strings.
// Hebrew letter + U+10905 (outside the BMP), ASCII text, two Arabic
// presentation forms.
String subject = String(
"\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
CopyString);
// Six copies of U+10905, i.e. only code points that need more than 16 bits.
String subject_32 = String(
"\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
CopyString);
String subject_en = String("this is an english string", CopyString);
// "this is a hebrew string"
String subject_he = String(
"\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
"\u05e2\u05d1\u05e8\u05d9\u05ea",
CopyString);
// "this is an arabic string"
String subject_ar = String(
"\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
"\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
CopyString);
// Mixed-script string: English text with one Hebrew word embedded.
String subject_mixed = String(
"this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
CopyString);
// Test basic regex parsing functionality.
VERIFY(f_icu_match("scripting", subject));
// Matching is case-sensitive by default: the subject only contains "PHP".
VERIFY(!f_icu_match("php", subject));
VERIFY(f_icu_match("(\\bPHP\\b)", subject));
// Unbalanced ')' -- the malformed pattern is expected not to match.
VERIFY(!f_icu_match("(\\bPHP\\b))", subject));
// Test returning matches functionality.
// `matches` is reused (overwritten) by every call below.
Variant matches;
VERIFY(f_icu_match("(PHP) is", subject, ref(matches)));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => PHP is\n"
" [1] => PHP\n"
")\n");
// With k_UREGEX_OFFSET_CAPTURE each entry becomes a [text, offset] pair.
// The expected offsets are consistent with counting code points: the
// leading Hebrew letter and U+10905 each count as one position.
VERIFY(f_icu_match("is (a)", subject, ref(matches),
k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => is a\n"
" [1] => 7\n"
" )\n"
"\n"
" [1] => Array\n"
" (\n"
" [0] => a\n"
" [1] => 10\n"
" )\n"
"\n"
")\n");
// Pattern containing a non-ASCII literal; no capture group, so only
// entry 0 is expected.
VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches),
k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => . \ufeb0\n"
" [1] => 30\n"
" )\n"
"\n"
")\n");
// Offset capture against an all-Arabic subject: the match starts after the
// first three-letter word and its trailing space (offset 4).
VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => Array\n"
" (\n"
" [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
" [1] => 4\n"
" )\n"
"\n"
" [1] => Array\n"
" (\n"
" [0] => \ufe8e\ufee0\ufee8\ufebb\n"
" [1] => 7\n"
" )\n"
"\n"
")\n");
// Test match for 32-bit code points.
VERIFY(f_icu_match(".*", subject_32, ref(matches)));
VS(f_print_r(matches, true),
"Array\n"
"(\n"
" [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
")\n");
// Test regex caching functionality.
// The same pattern text is used first with and then without the
// case-insensitive flag; the second call must not match. Presumably this
// guards against a cached compiled pattern being reused across different
// flags -- keep these two calls in this order.
VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE));
VERIFY(!f_icu_match("(php)", subject));
// Test ICU specific (ie bidi) functionality.
String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString);
String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString);
String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString);
// English: left-to-right only.
VERIFY(f_icu_match(pattern_ltr, subject_en));
VERIFY(!f_icu_match(pattern_rtl, subject_en));
// Hebrew: Right_To_Left only (not Arabic_Letter).
VERIFY(!f_icu_match(pattern_ltr, subject_he));
VERIFY(f_icu_match(pattern_rtl, subject_he));
VERIFY(!f_icu_match(pattern_arl, subject_he));
// Arabic: Arabic_Letter only; it is a class distinct from Right_To_Left.
VERIFY(!f_icu_match(pattern_ltr, subject_ar));
VERIFY(!f_icu_match(pattern_rtl, subject_ar));
VERIFY(f_icu_match(pattern_arl, subject_ar));
// Mixed English/Hebrew: both directions are present.
VERIFY(f_icu_match(pattern_ltr, subject_mixed));
VERIFY(f_icu_match(pattern_rtl, subject_mixed));
return Count(true);
}
// Test string lifted from tests/intl/utf8.h
// Exercises f_icu_transliterate on Cyrillic, German and Chinese UTF-8 input.
// The second argument is false where accents are expected to survive and
// true where they are expected to be stripped (see the German checks below).
bool TestExtIcu::test_icu_transliterate() {
// UTF-8 bytes spelled as hex escapes; the literals are split so that a hex
// escape is never directly followed by a character that is a hex digit.
String input_ru =
String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
"\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
CopyString);
String output_ru = f_icu_transliterate(input_ru, false);
// Note: different than php test ('y' -> 'j')
// The non-Cyrillic "\xc5\x93" in the input is expected to pass through
// unchanged.
VERIFY(output_ru == "fejsbu\xc5\x93k");
// Verify that removing accents works.
String input_de = String("Ich m\xc3\xb6"
"chte \xc3\xbc"
"berzeugend "
"oder \xc3\xa4hnliche sein",
CopyString);
String output_de = f_icu_transliterate(input_de, true);
VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein");
// Verify that keeping accents works.
VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str());
// Check a non-Latin language.
String input_zh = String("\xe5\x9b\x9b"
"\xe5\x8d\x81\xe5\x9b\x9b\xe7"
"\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
CopyString);
String output_zh = f_icu_transliterate(input_zh, true);
// Six Chinese characters are expected to become six space-separated,
// accent-free Latin syllables.
VERIFY(output_zh == "si shi si shi shi zi");
return Count(true);
}
// Exercises f_icu_tokenize and compares the print_r rendering of the token
// array against fixed expectations. Per the expected output below, tokens
// are lower-cased and wrapped in _B_ / _E_ begin/end markers.
bool TestExtIcu::test_icu_tokenize() {
String input_eng = String("Hello World");
Array output_eng = f_icu_tokenize(input_eng);
VS(f_print_r(output_eng, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => hello\n"
" [2] => world\n"
" [3] => _E_\n"
")\n"
);
// Input mixing punctuation, a number, a URL, a "<3" heart, a date and an
// e-mail address. The expected output shows the number replaced by XXXX
// and the URL / heart / date / e-mail replaced by TOKEN_* placeholders,
// while punctuation is emitted as separate tokens.
String input_long = String("Hello! You are visitor #1234 to "
"http://www.facebook.com! "
"<3 How are you today (6/14/2011),"
" hello@world.com?");
Array output_long = f_icu_tokenize(input_long);
VS(f_print_r(output_long, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => hello\n"
" [2] => !\n"
" [3] => you\n"
" [4] => are\n"
" [5] => visitor\n"
" [6] => #\n"
" [7] => XXXX\n"
" [8] => to\n"
" [9] => TOKEN_URL\n"
" [10] => !\n"
" [11] => TOKEN_HEART\n"
" [12] => how\n"
" [13] => are\n"
" [14] => you\n"
" [15] => today\n"
" [16] => (\n"
" [17] => TOKEN_DATE\n"
" [18] => )\n"
" [19] => ,\n"
" [20] => TOKEN_EMAIL\n"
" [21] => ?\n"
" [22] => _E_\n"
")\n"
);
// German input with umlauts. The expectation records each umlaut being
// split away from the surrounding ASCII letters into separate tokens.
// NOTE(review): the non-ASCII glyphs in these literals look mis-encoded in
// this copy of the file -- confirm the intended bytes before editing them.
String input_de = String("Ich möchte überzeugend oder ähnliche sein");
Array output_de = f_icu_tokenize(input_de);
VS(f_print_r(output_de, true),
"Array\n"
"(\n"
" [0] => _B_\n"
" [1] => ich\n"
" [2] => mã\n"
" [3] => ¶\n"
" [4] => chte\n"
" [5] => ã\n"
" [6] => ¼\n"
" [7] => berzeugend\n"
" [8] => oder\n"
" [9] => ã\n"
" [10] => ¤\n"
" [11] => hnliche\n"
" [12] => sein\n"
" [13] => _E_\n"
")\n");
return Count(true);
}
-37
Ver Arquivo
@@ -1,37 +0,0 @@
/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#ifndef incl_HPHP_TEST_EXT_ICU_H_
#define incl_HPHP_TEST_EXT_ICU_H_
// >>>>>> Generated by idl.php. Do NOT modify. <<<<<<
#include "hphp/test/ext/test_cpp_ext.h"
///////////////////////////////////////////////////////////////////////////////
// Test class for the ICU extension; derives from TestCppExt (declared in
// test_cpp_ext.h). This header is marked as generated by idl.php.
class TestExtIcu : public TestCppExt {
public:
// Entry point; `which` is handed to the test-running machinery.
// NOTE(review): presumably overrides a virtual in the base class -- confirm.
virtual bool RunTests(const std::string &which);
// One method per ICU extension function under test.
bool test_icu_match();
bool test_icu_transliterate();
bool test_icu_tokenize();
};
///////////////////////////////////////////////////////////////////////////////
#endif // incl_HPHP_TEST_EXT_ICU_H_
+239
Ver Arquivo
@@ -0,0 +1,239 @@
<?php
/**
 * Strict-equality assertion helper.
 *
 * Always prints one `bool(...)` line (the result of `$x === $y`), which is
 * what the expected-output file is compared against. On mismatch it also
 * prints the expected value, the actual value and a backtrace.
 *
 * @param mixed $x Actual value.
 * @param mixed $y Expected value.
 */
function VS($x, $y) {
  $same = ($x === $y);
  var_dump($same);
  if (!$same) {
    // var_export keeps false, null and arrays distinguishable; plain string
    // interpolation would print false/null as an empty string and any array
    // as just "Array", hiding the value that actually failed.
    echo "Failed: " . var_export($y, true) . "\n";
    echo "Got: " . var_export($x, true) . "\n";
    var_dump(debug_backtrace());
  }
}
/**
 * Asserts that $x is exactly boolean true by delegating to VS().
 *
 * @param mixed $x Value expected to be identical to true.
 */
function VERIFY($x) {
  VS($x, true);
}
//////////////////////////////////////////////////////////////////////
// Exercises icu_match(): plain matching, capture groups, offset capture,
// supplementary-plane code points, flag handling and Bidi_Class properties.
// Every VERIFY/VS call prints one bool(...) line.
function test_icu_match() {
// Test subject strings.
// UTF-8 bytes for a Hebrew letter and U+10905 (a 4-byte sequence), then
// ASCII text, then two Arabic presentation forms.
$subject = "\xd7\x96\xf0\x90\xa4\x85". " PHP is a scripting language. " .
"\xef\xba\xb0\xef\xbb\xb3";
// Six copies of the 4-byte sequence for U+10905.
$subject_32 =
"\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85" .
"\xf0\x90\xa4\x85\xf0\x90\xa4\x85";
$subject_en = "this is an english string";
// "this is a hebrew string"
$subject_he =
"\xd7\x96\xd7\x94\x20" .
"\xd7\x94\xd7\x95\xd7\x90\x20\xd7\x9e\xd7\x97\xd7\xa8\xd7\x95\xd7" .
"\x96\xd7\xaa\x20\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa";
// "this is an arabic string"
$subject_ar =
"\xef\xbb\xa9\xef".
"\xba\xab\xef\xba\x8d\x20\xef\xbb\xa9\xef\xbb\xad\x20\xef\xba\x8e".
"\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb\x20\xef\xba\x8d\xef\xbb\xba".
"\xef\xbb\xa8\xef\xba\xa0\xef\xbb\xa0\xef\xbb\xb3\xef\xba\xb0\xef".
"\xbb\xb3";
// Mixed-script string: English text with one Hebrew word embedded.
$subject_mixed =
"this is a ".
"\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa"
." string";
// Test basic regex parsing functionality.
// Results are compared loosely against false, so any non-false return
// counts as a match.
VERIFY(icu_match("scripting", $subject) != false);
// Matching is case-sensitive by default: the subject only contains "PHP".
VERIFY(icu_match("php", $subject) == false);
VERIFY(icu_match("(\\bPHP\\b)", $subject) != false);
// Unbalanced ')' -- the malformed pattern is expected not to match.
VERIFY(icu_match("(\\bPHP\\b))", $subject) == false);
// Test returning matches functionality.
// $matches is filled by reference and overwritten by each call below.
VERIFY(icu_match("(PHP) is", $subject, $matches) != false);
VS(print_r($matches, true),
"Array\n".
"(\n".
" [0] => PHP is\n".
" [1] => PHP\n".
")\n");
// With UREGEX_OFFSET_CAPTURE each entry becomes a [text, offset] pair.
// The expected offsets are consistent with counting code points, not
// bytes: the 2-byte Hebrew letter and the 4-byte U+10905 each count as one.
VERIFY(icu_match("is (a)", $subject, $matches,
UREGEX_OFFSET_CAPTURE) != false);
VS(print_r($matches, true),
"Array\n".
"(\n".
" [0] => Array\n".
" (\n".
" [0] => is a\n".
" [1] => 7\n".
" )\n".
"\n".
" [1] => Array\n".
" (\n".
" [0] => a\n".
" [1] => 10\n".
" )\n".
"\n".
")\n");
// Pattern containing a non-ASCII literal; no capture group, so only
// entry 0 is expected.
VERIFY(icu_match("\\. \xef\xba\xb0", $subject, $matches,
UREGEX_OFFSET_CAPTURE) != false);
VS(print_r($matches, true),
"Array\n".
"(\n".
" [0] => Array\n".
" (\n".
" [0] => . \xef\xba\xb0\n".
" [1] => 30\n".
" )\n".
"\n".
")\n");
// $junk1 / $junk2 hold the second and third Arabic words of $subject_ar so
// they can be interpolated into both the pattern and the expected output.
$junk1="\xef\xbb\xa9\xef\xbb\xad";
$junk2="\xef\xba\x8e\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb";
VERIFY(icu_match("$junk1 ($junk2)",
$subject_ar, $matches, UREGEX_OFFSET_CAPTURE) != false);
VS(print_r($matches, true),
"Array\n".
"(\n".
" [0] => Array\n".
" (\n".
" [0] => $junk1 $junk2\n".
" [1] => 4\n".
" )\n".
"\n".
" [1] => Array\n".
" (\n".
" [0] => $junk2\n".
" [1] => 7\n".
" )\n".
"\n".
")\n");
// Test match for 32-bit code points.
VERIFY(icu_match(".*", $subject_32, $matches) != false);
$expected="\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4".
"\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85";
VS(print_r($matches, true),
"Array\n".
"(\n".
" [0] => $expected\n".
")\n");
// Test regex caching functionality.
// The same pattern text is used first with and then without the
// case-insensitive flag; the second call must not match. Presumably this
// guards against a cached compiled pattern being reused across different
// flags -- keep these two calls in this order. $ignore is only a throwaway
// matches argument so that the flags argument can be supplied.
VERIFY(icu_match("(php)", $subject, $ignore, UREGEX_CASE_INSENSITIVE) != false);
VERIFY(icu_match("(php)", $subject) == false);
// Test ICU specific (ie bidi) functionality.
$pattern_ltr = "\\p{Bidi_Class=Left_To_Right}";
$pattern_rtl = "\\p{Bidi_Class=Right_To_Left}";
$pattern_arl = "\\p{Bidi_Class=Arabic_Letter}";
// English: left-to-right only.
VERIFY(icu_match($pattern_ltr, $subject_en) != false);
VERIFY(icu_match($pattern_rtl, $subject_en) == false);
// Hebrew: Right_To_Left only (not Arabic_Letter).
VERIFY(icu_match($pattern_ltr, $subject_he) == false);
VERIFY(icu_match($pattern_rtl, $subject_he) != false);
VERIFY(icu_match($pattern_arl, $subject_he) == false);
// Arabic: Arabic_Letter only; it is a class distinct from Right_To_Left.
VERIFY(icu_match($pattern_ltr, $subject_ar) == false);
VERIFY(icu_match($pattern_rtl, $subject_ar) == false);
VERIFY(icu_match($pattern_arl, $subject_ar) != false);
// Mixed English/Hebrew: both directions are present.
VERIFY(icu_match($pattern_ltr, $subject_mixed) != false);
VERIFY(icu_match($pattern_rtl, $subject_mixed) != false);
}
// Test string lifted from tests/intl/utf8.h
/**
 * Exercises icu_transliterate() on Cyrillic, German and Chinese UTF-8 input.
 * The second argument is false where accents must survive and true where
 * they must be stripped. Prints one bool(...) line per check.
 */
function test_icu_transliterate() {
  // Cyrillic text containing one non-Cyrillic character ("\xc5\x93").
  $cyrillic = "\xd1\x84\xd0\xb5\xd0\xb9\xd1".
              "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba";
  $cyrillic_latin = icu_transliterate($cyrillic, false);
  // Note: differs from the php test ('y' -> 'j').
  $expected_cyrillic = "fejsbu\xc5\x93k";
  VERIFY($cyrillic_latin == $expected_cyrillic);

  // German text with umlauts: accents removed when the flag is true.
  $german = "Ich m\xc3\xb6".
            "chte \xc3\xbc".
            "berzeugend ".
            "oder \xc3\xa4hnliche sein";
  $german_plain = icu_transliterate($german, true);
  $expected_german = "Ich mochte uberzeugend oder ahnliche sein";
  VERIFY($german_plain == $expected_german);

  // ... and left untouched when the flag is false.
  $german_kept = icu_transliterate($german, false);
  VERIFY($german_kept == $german);

  // Non-Latin script: Chinese characters become space-separated syllables.
  $chinese = "\xe5\x9b\x9b".
             "\xe5\x8d\x81\xe5\x9b\x9b\xe7".
             "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90";
  $chinese_latin = icu_transliterate($chinese, true);
  $expected_chinese = "si shi si shi shi zi";
  VERIFY($chinese_latin == $expected_chinese);
}
/**
 * Exercises icu_tokenize() and compares the print_r rendering of each token
 * list with a fixed expectation. Prints one bool(...) line per input.
 */
function test_icu_tokenize() {
  // Two plain English words: lower-cased and wrapped in _B_ / _E_ markers.
  $expected_simple =
    "Array\n".
    "(\n".
    " [0] => _B_\n".
    " [1] => hello\n".
    " [2] => world\n".
    " [3] => _E_\n".
    ")\n";
  $tokens_simple = icu_tokenize("Hello World");
  VS(print_r($tokens_simple, true), $expected_simple);

  // Sentence mixing punctuation, a number, a URL, a "<3" heart, a date and
  // an e-mail address; the special items are expected as placeholder tokens.
  $sentence = "Hello! You are visitor #1234 to ".
              "http://www.facebook.com! ".
              "<3 How are you today (6/14/2011),".
              " hello@world.com?";
  $expected_sentence =
    "Array\n".
    "(\n".
    " [0] => _B_\n".
    " [1] => hello\n".
    " [2] => !\n".
    " [3] => you\n".
    " [4] => are\n".
    " [5] => visitor\n".
    " [6] => #\n".
    " [7] => XXXX\n".
    " [8] => to\n".
    " [9] => TOKEN_URL\n".
    " [10] => !\n".
    " [11] => TOKEN_HEART\n".
    " [12] => how\n".
    " [13] => are\n".
    " [14] => you\n".
    " [15] => today\n".
    " [16] => (\n".
    " [17] => TOKEN_DATE\n".
    " [18] => )\n".
    " [19] => ,\n".
    " [20] => TOKEN_EMAIL\n".
    " [21] => ?\n".
    " [22] => _E_\n".
    ")\n";
  $tokens_sentence = icu_tokenize($sentence);
  VS(print_r($tokens_sentence, true), $expected_sentence);

  // German text with umlauts; the expectation records each umlaut being
  // split away from the surrounding ASCII letters.
  // NOTE(review): the non-ASCII glyphs below look mis-encoded in this copy
  // of the file -- confirm the intended bytes before editing them.
  $german = "Ich möchte überzeugend oder ähnliche sein";
  $expected_german =
    "Array\n".
    "(\n".
    " [0] => _B_\n".
    " [1] => ich\n".
    " [2] => mã\n".
    " [3] => ¶\n".
    " [4] => chte\n".
    " [5] => ã\n".
    " [6] => ¼\n".
    " [7] => berzeugend\n".
    " [8] => oder\n".
    " [9] => ã\n".
    " [10] => ¤\n".
    " [11] => hnliche\n".
    " [12] => sein\n".
    " [13] => _E_\n".
    ")\n";
  $tokens_german = icu_tokenize($german);
  VS(print_r($tokens_german, true), $expected_german);
}
// Script entry point: run each test group in turn. Every VS()/VERIFY() call
// inside them prints one bool(...) line, which forms the script's output.
test_icu_match();
test_icu_transliterate();
test_icu_tokenize();
+33
Ver Arquivo
@@ -0,0 +1,33 @@
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)