Port TestExtIcu to php

2013-06-14 23:32:15 -07:00
commit 99ce60eda7
@@ -36,7 +36,6 @@
 #include "hphp/facebook/extensions/urlextraction/test_ext_urlextraction.h"
 #include "hphp/test/ext/test_ext_curl.h"
 #include "hphp/test/ext/test_ext_file.h"
-#include "hphp/test/ext/test_ext_icu.h"
 #include "hphp/test/ext/test_ext_icu_ucnv.h"
 #include "hphp/test/ext/test_ext_icu_ucsdet.h"
 #include "hphp/test/ext/test_ext_icu_uspoof.h"
@@ -1,266 +0,0 @@
-/*
-   +----------------------------------------------------------------------+
-   | HipHop for PHP                                                       |
-   +----------------------------------------------------------------------+
-   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
-   +----------------------------------------------------------------------+
-   | This source file is subject to version 3.01 of the PHP license,      |
-   | that is bundled with this package in the file LICENSE, and is        |
-   | available through the world-wide-web at the following url:           |
-   | http://www.php.net/license/3_01.txt                                  |
-   | If you did not receive a copy of the PHP license and are unable to   |
-   | obtain it through the world-wide-web, please send a note to          |
-   | license@php.net so we can mail you a copy immediately.               |
-   +----------------------------------------------------------------------+
-*/
-
-#include "hphp/test/ext/test_ext_icu.h"
-#include "hphp/runtime/ext/ext_icu.h"
-#include <iostream>
-
-///////////////////////////////////////////////////////////////////////////////
-
-bool TestExtIcu::RunTests(const std::string &which) {
-  bool ret = true;
-
-  RUN_TEST(test_icu_match);
-  RUN_TEST(test_icu_transliterate);
-  RUN_TEST(test_icu_tokenize);
-
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-bool TestExtIcu::test_icu_match() {
-  // Test subject strings.
-  String subject = String(
-    "\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
-    CopyString);
-  String subject_32 = String(
-    "\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
-    CopyString);
-  String subject_en = String("this is an english string", CopyString);
-  // "this is a hebrew string"
-  String subject_he = String(
-    "\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
-    "\u05e2\u05d1\u05e8\u05d9\u05ea",
-    CopyString);
-  // "this is an arabic string"
-  String subject_ar = String(
-    "\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
-    "\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
-    CopyString);
-  // "this is a hebrew string"
-  String subject_mixed = String(
-    "this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
-    CopyString);
-
-  // Test basic regex parsing functionality.
-  VERIFY(f_icu_match("scripting", subject));
-  VERIFY(!f_icu_match("php", subject));
-  VERIFY(f_icu_match("(\\bPHP\\b)", subject));
-  VERIFY(!f_icu_match("(\\bPHP\\b))", subject));
-
-  // Test returning matches functionality.
-  Variant matches;
-  VERIFY(f_icu_match("(PHP) is", subject, ref(matches)));
-  VS(f_print_r(matches, true),
-    "Array\n"
-    "(\n"
-    "    [0] => PHP is\n"
-    "    [1] => PHP\n"
-    ")\n");
-  VERIFY(f_icu_match("is (a)", subject, ref(matches),
-                     k_UREGEX_OFFSET_CAPTURE));
-  VS(f_print_r(matches, true),
-     "Array\n"
-     "(\n"
-     "    [0] => Array\n"
-     "        (\n"
-     "            [0] => is a\n"
-     "            [1] => 7\n"
-     "        )\n"
-     "\n"
-     "    [1] => Array\n"
-     "        (\n"
-     "            [0] => a\n"
-     "            [1] => 10\n"
-     "        )\n"
-     "\n"
-     ")\n");
-  VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches),
-                     k_UREGEX_OFFSET_CAPTURE));
-  VS(f_print_r(matches, true),
-    "Array\n"
-    "(\n"
-    "    [0] => Array\n"
-    "        (\n"
-    "            [0] => . \ufeb0\n"
-    "            [1] => 30\n"
-    "        )\n"
-    "\n"
-    ")\n");
-  VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
-                     subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE));
-  VS(f_print_r(matches, true),
-    "Array\n"
-    "(\n"
-    "    [0] => Array\n"
-    "        (\n"
-    "            [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
-    "            [1] => 4\n"
-    "        )\n"
-    "\n"
-    "    [1] => Array\n"
-    "        (\n"
-    "            [0] => \ufe8e\ufee0\ufee8\ufebb\n"
-    "            [1] => 7\n"
-    "        )\n"
-    "\n"
-    ")\n");
-
-  // Test match for 32-bit code points.
-  VERIFY(f_icu_match(".*", subject_32, ref(matches)));
-  VS(f_print_r(matches, true),
-    "Array\n"
-    "(\n"
-    "    [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
-    ")\n");
-
-  // Test regex caching functionality.
-  VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE));
-  VERIFY(!f_icu_match("(php)", subject));
-
-  // Test ICU specific (ie bidi) functionality.
-  String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString);
-  String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString);
-  String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString);
-
- VERIFY(f_icu_match(pattern_ltr, subject_en));
-  VERIFY(!f_icu_match(pattern_rtl, subject_en));
-
-  VERIFY(!f_icu_match(pattern_ltr, subject_he));
-  VERIFY(f_icu_match(pattern_rtl, subject_he));
-  VERIFY(!f_icu_match(pattern_arl, subject_he));
-
-  VERIFY(!f_icu_match(pattern_ltr, subject_ar));
-  VERIFY(!f_icu_match(pattern_rtl, subject_ar));
-  VERIFY(f_icu_match(pattern_arl, subject_ar));
-
-  VERIFY(f_icu_match(pattern_ltr, subject_mixed));
-  VERIFY(f_icu_match(pattern_rtl, subject_mixed));
-
-  return Count(true);
-}
-
-// Test string lifted from tests/intl/utf8.h
-bool TestExtIcu::test_icu_transliterate() {
-  String input_ru =
-    String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
-           "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
-           CopyString);
-  String output_ru = f_icu_transliterate(input_ru, false);
-  // Note: different than php test ('y' -> 'j')
-  VERIFY(output_ru == "fejsbu\xc5\x93k");
-
-  // Verify that removing accents works.
-  String input_de = String("Ich m\xc3\xb6"
-                           "chte \xc3\xbc"
-                           "berzeugend "
-                            "oder \xc3\xa4hnliche sein",
-                           CopyString);
-  String output_de = f_icu_transliterate(input_de, true);
-  VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein");
-
-  // Verify that keeping accents works.
-  VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str());
-
-  // Check an non-Latin language.
-  String input_zh = String("\xe5\x9b\x9b"
-                           "\xe5\x8d\x81\xe5\x9b\x9b\xe7"
-                           "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
-                           CopyString);
-  String output_zh = f_icu_transliterate(input_zh, true);
-  VERIFY(output_zh == "si shi si shi shi zi");
-
-  return Count(true);
-}
-
-
-bool TestExtIcu::test_icu_tokenize() {
-
-
-  String input_eng = String("Hello World");
-  Array output_eng = f_icu_tokenize(input_eng);
-
-  VS(f_print_r(output_eng, true),
-     "Array\n"
-     "(\n"
-     "    [0] => _B_\n"
-     "    [1] => hello\n"
-     "    [2] => world\n"
-     "    [3] => _E_\n"
-     ")\n"
-    );
-  String input_long = String("Hello! You are visitor #1234 to "
-                            "http://www.facebook.com! "
-                            "<3 How are you today (6/14/2011),"
-                            " hello@world.com?");
-
-  Array output_long = f_icu_tokenize(input_long);
-
-  VS(f_print_r(output_long, true),
-     "Array\n"
-     "(\n"
-     "    [0] => _B_\n"
-     "    [1] => hello\n"
-     "    [2] => !\n"
-     "    [3] => you\n"
-     "    [4] => are\n"
-     "    [5] => visitor\n"
-     "    [6] => #\n"
-     "    [7] => XXXX\n"
-     "    [8] => to\n"
-     "    [9] => TOKEN_URL\n"
-     "    [10] => !\n"
-     "    [11] => TOKEN_HEART\n"
-     "    [12] => how\n"
-     "    [13] => are\n"
-     "    [14] => you\n"
-     "    [15] => today\n"
-     "    [16] => (\n"
-     "    [17] => TOKEN_DATE\n"
-     "    [18] => )\n"
-     "    [19] => ,\n"
-     "    [20] => TOKEN_EMAIL\n"
-     "    [21] => ?\n"
-     "    [22] => _E_\n"
-     ")\n"
-    );
-
-  String input_de = String("Ich mÃ¶chte Ã¼berzeugend oder Ã¤hnliche sein");
-  Array output_de = f_icu_tokenize(input_de);
-
-  VS(f_print_r(output_de, true),
-     "Array\n"
-     "(\n"
-     "    [0] => _B_\n"
-     "    [1] => ich\n"
-     "    [2] => mã\n"
-     "    [3] => ¶\n"
-     "    [4] => chte\n"
-     "    [5] => ã\n"
-     "    [6] => ¼\n"
-     "    [7] => berzeugend\n"
-     "    [8] => oder\n"
-     "    [9] => ã\n"
-     "    [10] => ¤\n"
-     "    [11] => hnliche\n"
-     "    [12] => sein\n"
-     "    [13] => _E_\n"
-     ")\n");
-
-
-  return Count(true);
-}
@@ -1,37 +0,0 @@
-/*
-   +----------------------------------------------------------------------+
-   | HipHop for PHP                                                       |
-   +----------------------------------------------------------------------+
-   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
-   +----------------------------------------------------------------------+
-   | This source file is subject to version 3.01 of the PHP license,      |
-   | that is bundled with this package in the file LICENSE, and is        |
-   | available through the world-wide-web at the following url:           |
-   | http://www.php.net/license/3_01.txt                                  |
-   | If you did not receive a copy of the PHP license and are unable to   |
-   | obtain it through the world-wide-web, please send a note to          |
-   | license@php.net so we can mail you a copy immediately.               |
-   +----------------------------------------------------------------------+
-*/
-
-#ifndef incl_HPHP_TEST_EXT_ICU_H_
-#define incl_HPHP_TEST_EXT_ICU_H_
-
-// >>>>>> Generated by idl.php. Do NOT modify. <<<<<<
-
-#include "hphp/test/ext/test_cpp_ext.h"
-
-///////////////////////////////////////////////////////////////////////////////
-
-class TestExtIcu : public TestCppExt {
- public:
-  virtual bool RunTests(const std::string &which);
-
-  bool test_icu_match();
-  bool test_icu_transliterate();
-  bool test_icu_tokenize();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-#endif // incl_HPHP_TEST_EXT_ICU_H_
@@ -0,0 +1,239 @@
+<?php
+
+function VS($x, $y) {  // Dumps whether actual $x is identical (===) to expected $y.
+  $same = ($x === $y);
+  var_dump($same);  // The verdict is printed for passes and failures alike.
+  if (!$same) { echo "Failed: $y\n"; echo "Got: $x\n"; var_dump(debug_backtrace()); }
+}
+function VERIFY($x) { $wanted = true; VS($x, $wanted); }  // Passes only for boolean true.
+
+//////////////////////////////////////////////////////////////////////
+
+function test_icu_match() {
+  // Exercises icu_match(); subjects are UTF-8 text spelled as byte escapes.
+  $subject = "\xd7\x96\xf0\x90\xa4\x85". " PHP is a scripting language. " .
+             "\xef\xba\xb0\xef\xbb\xb3";
+  $subject_32 =
+    "\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85" .
+    "\xf0\x90\xa4\x85\xf0\x90\xa4\x85";
+  $subject_en = "this is an english string";
+  // "this is a hebrew string"
+  $subject_he =
+    "\xd7\x96\xd7\x94\x20" .
+    "\xd7\x94\xd7\x95\xd7\x90\x20\xd7\x9e\xd7\x97\xd7\xa8\xd7\x95\xd7" .
+    "\x96\xd7\xaa\x20\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa";
+  // "this is an arabic string"
+  $subject_ar =
+    "\xef\xbb\xa9\xef".
+    "\xba\xab\xef\xba\x8d\x20\xef\xbb\xa9\xef\xbb\xad\x20\xef\xba\x8e".
+    "\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb\x20\xef\xba\x8d\xef\xbb\xba".
+    "\xef\xbb\xa8\xef\xba\xa0\xef\xbb\xa0\xef\xbb\xb3\xef\xba\xb0\xef".
+    "\xbb\xb3";
+  // English sentence with one Hebrew word in the middle (mixed direction).
+  $subject_mixed =
+    "this is a ".
+    "\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa"
+    ." string";
+  // VERIFY() needs a strict true, hence the "!= false" / "== false" wrappers.
+  // Test basic regex parsing functionality.
+  VERIFY(icu_match("scripting", $subject) != false);
+  VERIFY(icu_match("php", $subject) == false);
+  VERIFY(icu_match("(\\bPHP\\b)", $subject) != false);
+  VERIFY(icu_match("(\\bPHP\\b))", $subject) == false);  // extra ")"
+
+  // Test returning matches functionality.
+  VERIFY(icu_match("(PHP) is", $subject, $matches) != false);
+  VS(print_r($matches, true),
+    "Array\n".
+    "(\n".
+    "    [0] => PHP is\n".
+    "    [1] => PHP\n".
+    ")\n");
+  VERIFY(icu_match("is (a)", $subject, $matches,
+                     UREGEX_OFFSET_CAPTURE) != false);
+  VS(print_r($matches, true),
+     "Array\n".
+     "(\n".
+     "    [0] => Array\n".
+     "        (\n".
+     "            [0] => is a\n".
+     "            [1] => 7\n".  // offset counts code points, not bytes
+     "        )\n".
+     "\n".
+     "    [1] => Array\n".
+     "        (\n".
+     "            [0] => a\n".
+     "            [1] => 10\n".
+     "        )\n".
+     "\n".
+     ")\n");
+  VERIFY(icu_match("\\. \xef\xba\xb0", $subject, $matches,
+                     UREGEX_OFFSET_CAPTURE) != false);
+  VS(print_r($matches, true),
+    "Array\n".
+    "(\n".
+    "    [0] => Array\n".
+    "        (\n".
+    "            [0] => . \xef\xba\xb0\n".
+    "            [1] => 30\n".
+    "        )\n".
+    "\n".
+    ")\n");
+  $junk1="\xef\xbb\xa9\xef\xbb\xad";  // second word of $subject_ar
+  $junk2="\xef\xba\x8e\xef\xbb\xa0\xef\xbb\xa8\xef\xba\xbb";  // third word
+  VERIFY(icu_match("$junk1 ($junk2)",
+                     $subject_ar, $matches, UREGEX_OFFSET_CAPTURE) != false);
+  VS(print_r($matches, true),
+    "Array\n".
+    "(\n".
+    "    [0] => Array\n".
+    "        (\n".
+    "            [0] => $junk1 $junk2\n".
+    "            [1] => 4\n".
+    "        )\n".
+    "\n".
+    "    [1] => Array\n".
+    "        (\n".
+    "            [0] => $junk2\n".
+    "            [1] => 7\n".
+    "        )\n".
+    "\n".
+    ")\n");
+
+  // Test match for 32-bit code points.
+  VERIFY(icu_match(".*", $subject_32, $matches) != false);
+  $expected="\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4".
+    "\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85\xf0\x90\xa4\x85";
+  VS(print_r($matches, true),
+    "Array\n".
+    "(\n".
+    "    [0] => $expected\n".
+    ")\n");
+
+  // Test regex caching: same pattern with and without the case flag.
+  VERIFY(icu_match("(php)", $subject, $ignore, UREGEX_CASE_INSENSITIVE) != false);
+  VERIFY(icu_match("(php)", $subject) == false);
+
+  // Test ICU specific (ie bidi) functionality.
+  $pattern_ltr = "\\p{Bidi_Class=Left_To_Right}";
+  $pattern_rtl = "\\p{Bidi_Class=Right_To_Left}";
+  $pattern_arl = "\\p{Bidi_Class=Arabic_Letter}";
+
+  VERIFY(icu_match($pattern_ltr, $subject_en) != false);
+  VERIFY(icu_match($pattern_rtl, $subject_en) == false);
+
+  VERIFY(icu_match($pattern_ltr, $subject_he) == false);
+  VERIFY(icu_match($pattern_rtl, $subject_he) != false);
+  VERIFY(icu_match($pattern_arl, $subject_he) == false);
+
+  VERIFY(icu_match($pattern_ltr, $subject_ar) == false);
+  VERIFY(icu_match($pattern_rtl, $subject_ar) == false);
+  VERIFY(icu_match($pattern_arl, $subject_ar) != false);
+
+  VERIFY(icu_match($pattern_ltr, $subject_mixed) != false);
+  VERIFY(icu_match($pattern_rtl, $subject_mixed) != false);
+}
+
+// Test string lifted from tests/intl/utf8.h
+function test_icu_transliterate() {
+  // Cyrillic sample, transliterated with accent removal switched off.
+  $russian = "\xd1\x84\xd0\xb5\xd0\xb9\xd1\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba";
+  $latin = icu_transliterate($russian, false);
+  // Note: different than php test ('y' -> 'j')
+  VERIFY($latin == "fejsbu\xc5\x93k");
+
+  // German sample. PHP reads at most two hex digits after \x, so an escape
+  // may sit directly in front of a letter such as "c" or "b".
+  $german = "Ich m\xc3\xb6chte \xc3\xbcberzeugend oder \xc3\xa4hnliche sein";
+
+  // Passing true must strip the umlauts ...
+  $stripped = icu_transliterate($german, true);
+  VERIFY($stripped == "Ich mochte uberzeugend oder ahnliche sein");
+
+  // ... while passing false must hand the text back untouched.
+  $kept = icu_transliterate($german, false);
+  VERIFY($kept == $german);
+
+  // Non-Latin script: six Chinese characters, accents removed.
+  $chinese = "\xe5\x9b\x9b\xe5\x8d\x81\xe5\x9b\x9b" .
+             "\xe7\x9f\xb3\xe7\x8d\x85\xe5\xad\x90";
+  $romanised = icu_transliterate($chinese, true);
+  VERIFY($romanised == "si shi si shi shi zi");
+}
+
+
+function test_icu_tokenize() {  // Checks icu_tokenize() output for three inputs.
+  $input_eng = "Hello World";
+  $output_eng = icu_tokenize($input_eng);
+  // Expected tokens are lower-cased and framed by _B_ and _E_ markers.
+  VS(print_r($output_eng, true),
+     "Array\n".
+     "(\n".
+     "    [0] => _B_\n".
+     "    [1] => hello\n".
+     "    [2] => world\n".
+     "    [3] => _E_\n".
+     ")\n"
+    );
+  $input_long = "Hello! You are visitor #1234 to ".
+                            "http://www.facebook.com! ".
+                            "<3 How are you today (6/14/2011),".
+                            " hello@world.com?";
+  // Here the number, URL, "<3", date and e-mail map to placeholder tokens.
+  $output_long = icu_tokenize($input_long);
+
+  VS(print_r($output_long, true),
+     "Array\n".
+     "(\n".
+     "    [0] => _B_\n".
+     "    [1] => hello\n".
+     "    [2] => !\n".
+     "    [3] => you\n".
+     "    [4] => are\n".
+     "    [5] => visitor\n".
+     "    [6] => #\n".
+     "    [7] => XXXX\n".
+     "    [8] => to\n".
+     "    [9] => TOKEN_URL\n".
+     "    [10] => !\n".
+     "    [11] => TOKEN_HEART\n".
+     "    [12] => how\n".
+     "    [13] => are\n".
+     "    [14] => you\n".
+     "    [15] => today\n".
+     "    [16] => (\n".
+     "    [17] => TOKEN_DATE\n".
+     "    [18] => )\n".
+     "    [19] => ,\n".
+     "    [20] => TOKEN_EMAIL\n".
+     "    [21] => ?\n".
+     "    [22] => _E_\n".
+     ")\n"
+    );
+  // NOTE(review): input looks like mis-encoded UTF-8; confirm before changing.
+  $input_de = "Ich mÃ¶chte Ã¼berzeugend oder Ã¤hnliche sein";
+  $output_de = icu_tokenize($input_de);
+
+  VS(print_r($output_de, true),
+     "Array\n".
+     "(\n".
+     "    [0] => _B_\n".
+     "    [1] => ich\n".
+     "    [2] => mã\n".
+     "    [3] => ¶\n".
+     "    [4] => chte\n".
+     "    [5] => ã\n".
+     "    [6] => ¼\n".
+     "    [7] => berzeugend\n".
+     "    [8] => oder\n".
+     "    [9] => ã\n".
+     "    [10] => ¤\n".
+     "    [11] => hnliche\n".
+     "    [12] => sein\n".
+     "    [13] => _E_\n".
+     ")\n");
+}
+
+test_icu_match();          // 26 VS/VERIFY checks, one bool line each
+test_icu_transliterate();  // 4 checks
+test_icu_tokenize();       // 3 checks -> 33 bool(...) lines overall
@@ -0,0 +1,33 @@
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)