Arquivos
hhvm/hphp/runtime/base/zend_collator.cpp
T
Edwin Smith 721f89b890 Flatten directories under runtime/base
This moves runtime/base/*/* to runtime/base, and fixes paths.
2013-07-18 17:28:35 -07:00

628 linhas
19 KiB
C++

/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
| Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 2.00 of the Zend license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.zend.com/license/2_00.txt. |
| If you did not receive a copy of the Zend license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@zend.com so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
#include "hphp/runtime/base/zend_collator.h"
#include "hphp/runtime/base/zend_strtod.h"
#include "hphp/runtime/base/intl_convert.h"
#include "hphp/runtime/base/type_conversions.h"
#include "hphp/runtime/base/builtin_functions.h"
#include "hphp/runtime/base/types.h"
#include "hphp/runtime/base/complex_types.h"
#include "hphp/runtime/base/runtime_error.h"
#include "hphp/runtime/base/array_iterator.h"
#include "hphp/runtime/base/comparisons.h"
namespace HPHP {
/* Defines s_intl_error through IMPLEMENT_REQUEST_LOCAL; the sort routines
 * below mirror their last conversion error into its m_error member. */
IMPLEMENT_REQUEST_LOCAL(IntlError, s_intl_error);
/* Convert a length in bytes into a count of UChar code units. */
#define UCHARS(len) ((len) / sizeof(UChar))
/* Convert a count of UChar code units into a length in bytes. */
#define UBYTES(len) ((len) * sizeof(UChar))
/* Forward declaration: collator_convert_string_to_number() calls this
 * before its definition further down in the file. */
static Variant collator_convert_string_to_number_if_possible(CVarRef str);
/**
 * Parse a decimal floating-point number from a NUL-terminated UChar string.
 *
 * Accepted syntax: optional leading whitespace (per u_isspace), an optional
 * sign, digits with an optional '.' fraction, and an optional exponent
 * ('e' or 'E', optional sign, at least one digit). The recognised characters
 * are narrowed to char and handed to zend_strtod() for the conversion.
 *
 * @param nptr    string to parse
 * @param endptr  if non-null, receives the position just past the parsed
 *                number, or nptr when nothing was converted
 * @return the parsed value; 0 when no digit was found or when the temporary
 *         conversion buffer could not be allocated
 */
static double collator_u_strtod(const UChar *nptr, UChar **endptr) {
  const UChar *u = nptr, *nstart;
  UChar c = *u;
  int any = 0;

  while (u_isspace(c)) {
    c = *++u;
  }
  nstart = u;

  if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
    c = *++u;
  }

  /* Integer part. */
  while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
    any = 1;
    c = *++u;
  }

  /* Fractional part. */
  if (c == 0x2E /*'.'*/) {
    c = *++u;
    while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
      any = 1;
      c = *++u;
    }
  }

  /* Exponent: only accepted after a mantissa digit, and only when it
   * carries at least one digit itself; otherwise rewind to the 'e'. */
  if ((c == 0x65 /*'e'*/ || c == 0x45 /*'E'*/) && any) {
    const UChar *e = u;
    int any_exp = 0;
    c = *++u;
    if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
      c = *++u;
    }
    while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
      any_exp = 1;
      c = *++u;
    }
    if (!any_exp) {
      u = e;
    }
  }

  if (any) {
    char buf[64], *numbuf, *bufpos;
    int length = u - nstart;
    double value;

    /* Short numbers use the stack buffer; longer ones go to the heap. */
    if (length < (int)sizeof(buf)) {
      numbuf = buf;
    } else {
      numbuf = (char *) malloc(length + 1);
      if (numbuf == nullptr) {
        /* Allocation failed: report "nothing converted" rather than
         * writing through a null pointer. */
        if (endptr != nullptr) {
          *endptr = (UChar *)nptr;
        }
        return 0;
      }
    }

    /* Everything in [nstart, u) is a sign, digit, '.' or 'e'/'E', so the
     * narrowing cast to char is lossless. */
    bufpos = numbuf;
    while (nstart < u) {
      *bufpos++ = (char) *nstart++;
    }
    *bufpos = '\0';

    value = zend_strtod(numbuf, nullptr);

    if (numbuf != buf) {
      free(numbuf);
    }
    if (endptr != nullptr) {
      *endptr = (UChar *)u;
    }
    return value;
  }

  if (endptr != nullptr) {
    *endptr = (UChar *)nptr;
  }
  return 0;
}
/**
 * strtol() work-alike operating on a NUL-terminated UChar string.
 *
 * @param nptr    string to parse; a null pointer sets errno to ERANGE,
 *                stores nullptr in *endptr and returns 0
 * @param endptr  if non-null, receives the position of the first character
 *                not consumed, or nptr when no digit was consumed
 * @param base    numeric base; 0 auto-detects hex ("0x"/"0X"), octal
 *                (leading '0') or decimal
 * @return the converted value; on overflow LONG_MIN or LONG_MAX is
 *         returned and errno is set to ERANGE
 */
static long collator_u_strtol(const UChar *nptr, UChar **endptr,
int base) {
const UChar *s = nptr;
unsigned long acc;
UChar c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/* Reject a null input string up front. */
if (s == nullptr) {
errno = ERANGE;
if (endptr != nullptr) {
*endptr = nullptr;
}
return 0;
}
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
/* From here on c holds the current character and s already points one
 * past it. */
do {
c = *s++;
} while (u_isspace(c));
if (c == 0x2D /*'-'*/) {
neg = 1;
c = *s++;
} else if (c == 0x2B /*'+'*/)
c = *s++;
/* Consume a "0x"/"0X" prefix and switch to base 16. */
if ((base == 0 || base == 16) &&
(c == 0x30 /*'0'*/)
&& (*s == 0x78 /*'x'*/ || *s == 0x58 /*'X'*/)) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = (c == 0x30 /*'0'*/) ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
/* Map the character to its digit value; stop at the first character
 * that is not alphanumeric ASCII. */
if (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/)
c -= 0x30 /*'0'*/;
else if (c >= 0x41 /*'A'*/ && c <= 0x5A /*'Z'*/)
c -= 0x41 /*'A'*/ - 10;
else if (c >= 0x61 /*'a'*/ && c <= 0x7A /*'z'*/)
c -= 0x61 /*'a'*/ - 10;
else
break;
/* Digit value is not valid in this base. */
if (c >= base)
break;
/* Once overflow is detected keep consuming digits but stop accumulating. */
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
/* s is one past the character that ended the scan, hence s - 1. */
if (endptr != nullptr)
*endptr = (UChar *)(any ? s - 1 : nptr);
return (acc);
}
/**
 * Classify a UChar string as an integer, a double, or non-numeric.
 *
 * @param str           string to inspect
 * @param length        length of str in UChar units
 * @param lval          if non-null, receives the value when KindOfInt64
 *                      is returned
 * @param dval          if non-null, receives the value when KindOfDouble
 *                      is returned
 * @param allow_errors  0: only a fully numeric string is accepted;
 *                      non-zero: a numeric prefix is accepted as well;
 *                      -1: as non-zero, and a notice is raised
 * @return KindOfInt64, KindOfDouble, or KindOfNull when not numeric
 */
static DataType collator_is_numeric(UChar *str, int length, int64_t *lval,
                                    double *dval, int allow_errors ) {
  if (!length) {
    return KindOfNull;
  }

  /* A leading "0x"/"0X" selects base 16 for the integer parse. */
  const bool hex_prefix =
    length>=2 && str[0]=='0' && (str[1]=='x' || str[1]=='X');
  const int radix = hex_prefix ? 16 : 10;
  UChar *const str_end = str+length;

  /* First attempt: integer. */
  UChar *long_end = nullptr;
  errno=0;
  int64_t as_long = collator_u_strtol(str, &long_end, radix);
  if (errno == ERANGE) {
    /* Out of range for a long: discard the integer parse. */
    long_end = nullptr;
  } else if (long_end == str_end) {
    /* The whole string is an integer. */
    if (lval) {
      *lval = as_long;
    }
    return KindOfInt64;
  } else if (long_end == str &&
             *long_end != '\0' &&
             *str != '.' &&
             *str != '-') {
    /* Nothing consumed and no chance of it being a double either. */
    return KindOfNull;
  }

  if (radix == 16) {
    /* Hex strings are never handed to the floating-point parser.
     * UTODO: keep compatibility with is_numeric_string() here? */
    return KindOfNull;
  }

  /* Second attempt: floating point. */
  UChar *double_end = nullptr;
  double as_double = collator_u_strtod(str, &double_end);
  if (as_double == 0 && double_end == str) {
    double_end = nullptr;
  } else if (double_end == str_end) {
    /* The whole string is a floating point number. */
    if (!finite(as_double)) {
      /* "inf","nan" and maybe other weird ones */
      return KindOfNull;
    }
    if (dval) {
      *dval = as_double;
    }
    return KindOfDouble;
  }

  /* Only a prefix (or nothing) was numeric. */
  if (!allow_errors) {
    return KindOfNull;
  }
  if (allow_errors == -1) {
    raise_notice("A non well formed numeric value encountered");
  }

  /* Prefer the double only when it consumed strictly more input. */
  if (double_end > long_end && dval) {
    *dval = as_double;
    return KindOfDouble;
  }
  if (long_end && lval) {
    *lval = as_long;
    return KindOfInt64;
  }
  return KindOfNull;
}
/**
 * Convert a UTF-8 String into a String whose bytes hold UTF-16 code units.
 *
 * @param utf8_str  source text
 * @param status    ICU status, written by intl_convert_utf8_to_utf16()
 * @return the converted text wrapped with AttachString, or an empty string
 *         when *status reports a failure
 */
static String intl_convert_str_utf8_to_utf16(CStrRef utf8_str,
                                             UErrorCode * status) {
  UChar* utf16_buf = nullptr;
  int utf16_units = 0;
  intl_convert_utf8_to_utf16(&utf16_buf, &utf16_units,
                             utf8_str.data(), utf8_str.length(),
                             status);
  if (!U_FAILURE(*status)) {
    /* The String length is expressed in bytes, not UChar units. */
    return String((char*)utf16_buf, UBYTES(utf16_units), AttachString);
  }
  /* Wide empty literal viewed as bytes: its first byte is NUL, so this
   * yields an empty string. */
  return (const char *)(L"");
}
/**
 * Convert a String whose bytes hold UTF-16 code units into a UTF-8 String.
 *
 * @param utf16_str  source text; its byte length is divided by
 *                   sizeof(UChar) to obtain the code-unit count
 * @param status     ICU status, written by intl_convert_utf16_to_utf8()
 * @return the converted text wrapped with AttachString, or an empty string
 *         when *status reports a failure
 */
static String intl_convert_str_utf16_to_utf8(CStrRef utf16_str,
                                             UErrorCode * status) {
  char* utf8_buf = nullptr;
  int utf8_len = 0;
  intl_convert_utf16_to_utf8(&utf8_buf, &utf8_len,
                             (UChar*)(utf16_str.data()),
                             UCHARS(utf16_str.length()),
                             status);
  if (!U_FAILURE(*status)) {
    return String(utf8_buf, utf8_len, AttachString);
  }
  return "";
}
/**
 * Convert a string to a number, falling back to zero.
 *
 * @param str  value to convert
 * @return the number produced by
 *         collator_convert_string_to_number_if_possible(), or 0 when that
 *         function reports failure by returning false
 */
static Variant collator_convert_string_to_number(CVarRef str) {
  Variant converted = collator_convert_string_to_number_if_possible(str);
  if (!same(converted, false)) {
    return converted;
  }
  /* Not convertible: treat as zero. */
  return 0;
}
/**
 * Convert a string to a double.
 *
 * @param str  value to convert
 * @return collator_convert_string_to_number(str) coerced with toDouble()
 */
static Variant collator_convert_string_to_double(CVarRef str) {
  return collator_convert_string_to_number(str).toDouble();
}
/**
 * Try to interpret a string value as a number.
 *
 * The string's bytes are handed to collator_is_numeric() as UChar units
 * with allow_errors set to 1.
 *
 * @param str  value to inspect
 * @return the integer or double value when the string is numeric;
 *         false when str is not a string or is not numeric
 */
static Variant collator_convert_string_to_number_if_possible(CVarRef str) {
  if (!str.isString()) return false;

  int64_t int_value = 0;
  double dbl_value = 0;
  String text = str.toString();
  DataType kind = collator_is_numeric((UChar*)(text.data()),
                                      UCHARS(text.length()),
                                      &int_value, &dbl_value, 1);
  switch (kind) {
    case KindOfInt64:
      return int_value;
    case KindOfDouble:
      return dbl_value;
    default:
      return false;
  }
}
/**
 * Replace an object by its string form, converted from UTF-8 to UTF-16.
 *
 * @param obj  value to convert
 * @return obj unchanged when it is not an object or when its toString()
 *         throws; the UTF-16 string on success; null (after raising a
 *         warning) when the UTF-8 to UTF-16 conversion fails
 */
static Variant collator_convert_object_to_string(CVarRef obj) {
  if (!obj.isObject()) return obj;

  String str;
  try {
    str = obj.toString();
  } catch (Exception &e) {
    /* The object cannot be stringified: leave it as it is. */
    return obj;
  }

  /* ICU status codes must start out as U_ZERO_ERROR; an indeterminate
   * value must never reach the converter or U_FAILURE(). */
  UErrorCode status = U_ZERO_ERROR;
  String ustr = intl_convert_str_utf8_to_utf16(str, &status);
  if (U_FAILURE(status)) {
    raise_warning("Error casting object to string in "
                  "collator_convert_object_to_string()");
    return uninit_null();
  }
  return ustr;
}
/**
 * Convert every string value of an array from UTF-16 to UTF-8.
 *
 * Non-string values are left alone. Conversion stops at the first failure,
 * leaving the values processed so far converted.
 *
 * @param array   array to update
 * @param status  ICU status; a failure code is left in *status when a
 *                conversion fails
 */
static void collator_convert_array_from_utf16_to_utf8(Array &array,
                                                      UErrorCode * status) {
  for (ArrayIter iter(array); iter; ++iter) {
    CVarRef value = iter.secondRef();
    /* Process string values only. */
    if (!value.isString()) continue;
    String str = intl_convert_str_utf16_to_utf8(value.toString(), status);
    if (U_FAILURE(*status)) {
      return;
    }
    /* Store the converted value through the Array itself, as
     * collator_convert_array_from_utf8_to_utf16() does, rather than writing
     * through a const_cast of the iterator's const reference. */
    Variant key = iter.first();
    array.set(key, str);
  }
}
/**
 * Convert every string value of an array from UTF-8 to UTF-16.
 *
 * Non-string values are left alone. Conversion stops at the first failure,
 * leaving the values processed so far converted.
 *
 * @param array   array to update
 * @param status  ICU status; a failure code is left in *status when a
 *                conversion fails
 */
static void collator_convert_array_from_utf8_to_utf16(Array &array,
                                                      UErrorCode * status) {
  for (ArrayIter it(array); it; ++it) {
    CVarRef element = it.secondRef();
    if (element.isString()) {
      String utf16 = intl_convert_str_utf8_to_utf16(element.toString(),
                                                    status);
      if (U_FAILURE(*status)) {
        return;
      }
      /* Write the converted value back under the same key. */
      Variant element_key = it.first();
      array.set(element_key, utf16);
    }
  }
}
/**
 * Prepare one operand for the generic (non-ICU) comparison.
 *
 * @param arg  operand; strings are expected to hold UTF-16 data
 * @return arg unchanged when it is not a string; its numeric value when it
 *         is a numeric string; otherwise the string converted to UTF-8
 *         (a warning is raised when that conversion fails)
 */
static Variant collator_normalize_sort_argument(CVarRef arg) {
  if (!arg.isString()) return arg;

  Variant n_arg = collator_convert_string_to_number_if_possible(arg);
  if (same(n_arg, false)) {
    /* Conversion to number failed. */
    /* ICU status codes must start out as U_ZERO_ERROR; an indeterminate
     * value must never reach the converter or U_FAILURE(). */
    UErrorCode status = U_ZERO_ERROR;
    n_arg = intl_convert_str_utf16_to_utf8(arg.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf16 to utf8 in "
                    "collator_normalize_sort_argument()");
    }
  }
  return n_arg;
}
/**
 * Comparison used for COLLATOR_SORT_REGULAR.
 *
 * Objects are first replaced by their UTF-16 string form. If both operands
 * are then strings and at least one of them is not numeric, they are
 * compared with ucol_strcoll(). Otherwise the operands are normalized
 * (numeric strings become numbers, other strings become UTF-8) and compared
 * with less()/equal().
 *
 * @param v1         left operand
 * @param v2         right operand
 * @param data       the UCollator to use for the ICU comparison
 * @param ascending  true for ascending order; false inverts the result
 * @return a negative, zero or positive value in the usual comparator sense
 */
static int collator_regular_compare_function(CVarRef v1, CVarRef v2,
                                             const void *data,
                                             bool ascending) {
  Variant str1 = collator_convert_object_to_string(v1);
  Variant str2 = collator_convert_object_to_string(v2);
  Variant norm1;
  Variant norm2;

  if (str1.isString() && str2.isString()) {
    /* str2 is only parsed when str1 turned out to be numeric. */
    Variant num1 = collator_convert_string_to_number_if_possible(str1);
    Variant num2;
    if (!same(num1, false)) {
      num2 = collator_convert_string_to_number_if_possible(str2);
    }

    if (same(num1, false) || same(num2, false)) {
      /* At least one operand is a non-numeric string: ICU compare. */
      assert(data);
      int ret = ucol_strcoll((const UCollator *)data,
                             (UChar*)(str1.toString().data()),
                             UCHARS(str1.toString().length()),
                             (UChar*)(str2.toString().data()),
                             UCHARS(str2.toString().length()));
      return ascending ? ret : (-ret);
    }

    /* Both operands are numeric strings: compare their numeric values.
     * The non-numeric case returned above, so no separate
     * "str1 is not numeric" handling is needed here. */
    norm1 = num1;
    norm2 = num2;
  } else {
    /* str1 or str2 is not a string => do normalization. */
    norm1 = collator_normalize_sort_argument(str1);
    norm2 = collator_normalize_sort_argument(str2);
  }

  if (ascending) {
    if (less(norm1, norm2)) return -1;
    if (equal(norm1, norm2)) return 0;
    return 1;
  }
  if (less(norm1, norm2)) return 1;
  if (equal(norm1, norm2)) return 0;
  return -1;
}
/* Three-argument comparator that forwards to
 * collator_regular_compare_function() with ascending order selected. */
static int collator_regular_compare_ascending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  const bool ascending = true;
  return collator_regular_compare_function(v1, v2, data, ascending);
}
/* Three-argument comparator that forwards to
 * collator_regular_compare_function() with descending order selected. */
static int collator_regular_compare_descending(CVarRef v1, CVarRef v2,
                                               const void *data) {
  const bool ascending = false;
  return collator_regular_compare_function(v1, v2, data, ascending);
}
/**
 * Comparison used for COLLATOR_SORT_NUMERIC: both operands are reduced to
 * doubles and compared with less()/equal().
 *
 * @param v1         left operand
 * @param v2         right operand
 * @param data       not used by this comparator
 * @param ascending  true for ascending order; false inverts the result
 * @return -1, 0 or 1
 */
static int collator_numeric_compare_function(CVarRef v1, CVarRef v2,
                                             const void *data,
                                             bool ascending) {
  /* Strings go through the collator's own numeric parser; everything else
   * uses the value's toDouble(). */
  Variant lhs;
  if (!v1.isString()) {
    lhs = v1.toDouble();
  } else {
    lhs = collator_convert_string_to_double(v1);
  }

  Variant rhs;
  if (!v2.isString()) {
    rhs = v2.toDouble();
  } else {
    rhs = collator_convert_string_to_double(v2);
  }

  /* Work out the ascending answer, then flip it for descending order. */
  int order;
  if (less(lhs, rhs)) {
    order = -1;
  } else if (equal(lhs, rhs)) {
    order = 0;
  } else {
    order = 1;
  }
  return ascending ? order : -order;
}
/* Three-argument comparator that forwards to
 * collator_numeric_compare_function() with ascending order selected. */
static int collator_numeric_compare_ascending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  const bool ascending = true;
  return collator_numeric_compare_function(v1, v2, data, ascending);
}
/* Three-argument comparator that forwards to
 * collator_numeric_compare_function() with descending order selected. */
static int collator_numeric_compare_descending(CVarRef v1, CVarRef v2,
                                               const void *data) {
  const bool ascending = false;
  return collator_numeric_compare_function(v1, v2, data, ascending);
}
/**
 * Comparison used for COLLATOR_SORT_STRING: both operands are compared as
 * UTF-16 text with ucol_strcoll().
 *
 * String operands are used as they are (the sort driver in this file
 * converts the array's strings to UTF-16 beforehand); any other operand is
 * stringified and converted from UTF-8 to UTF-16 here. When that conversion
 * fails a warning is raised and the comparison proceeds with whatever the
 * converter returned.
 *
 * @param v1         left operand
 * @param v2         right operand
 * @param data       the UCollator to compare with; must not be null
 * @param ascending  true for ascending order; false inverts the result
 * @return the ucol_strcoll() result, negated for descending order
 */
static int collator_string_compare_function(CVarRef v1, CVarRef v2,
                                            const void *data,
                                            bool ascending) {
  assert(data);

  String str1;
  if (v1.isString()) {
    str1 = v1.toString();
  } else {
    /* ICU status codes must start out as U_ZERO_ERROR; an indeterminate
     * value must never reach the converter or U_FAILURE(). */
    UErrorCode status = U_ZERO_ERROR;
    str1 = intl_convert_str_utf8_to_utf16(v1.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf8 to utf16 in "
                    "collator_string_compare_function()");
    }
  }

  String str2;
  if (v2.isString()) {
    str2 = v2.toString();
  } else {
    UErrorCode status = U_ZERO_ERROR;
    str2 = intl_convert_str_utf8_to_utf16(v2.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf8 to utf16 in "
                    "collator_string_compare_function()");
    }
  }

  int ret = ucol_strcoll((const UCollator *)data,
                         (UChar*)(str1.data()),
                         UCHARS(str1.length()),
                         (UChar*)(str2.data()),
                         UCHARS(str2.length()));
  return ascending ? ret : (-ret);
}
/* Three-argument comparator that forwards to
 * collator_string_compare_function() with ascending order selected. */
static int collator_string_compare_ascending(CVarRef v1, CVarRef v2,
                                             const void *data) {
  const bool ascending = true;
  return collator_string_compare_function(v1, v2, data, ascending);
}
/* Three-argument comparator that forwards to
 * collator_string_compare_function() with descending order selected. */
static int collator_string_compare_descending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  const bool ascending = false;
  return collator_string_compare_function(v1, v2, data, ascending);
}
static bool collator_sort_internal(bool renumber, Variant &array,
int sort_flags, bool ascending,
UCollator *coll, intl_error * errcode) {
assert(coll);
errcode->clear();
s_intl_error->m_error.clear();
Array temp = array.toArray();
Array::PFUNC_CMP cmp_func;
switch (sort_flags) {
case COLLATOR_SORT_NUMERIC:
cmp_func = ascending ? collator_numeric_compare_ascending
: collator_numeric_compare_descending;
break;
case COLLATOR_SORT_STRING:
cmp_func = ascending ? collator_string_compare_ascending
: collator_string_compare_descending;
break;
case COLLATOR_SORT_REGULAR:
default:
cmp_func = ascending ? collator_regular_compare_ascending
: collator_regular_compare_descending;
break;
}
/* Convert strings in the specified array from UTF-8 to UTF-16. */
collator_convert_array_from_utf8_to_utf16(temp, &(errcode->code));
if (U_FAILURE(errcode->code)) {
errcode->custom_error_message =
"Error converting array from UTF-8 to UTF-16";
s_intl_error->m_error.code = errcode->code;
s_intl_error->m_error.custom_error_message = errcode->custom_error_message;
return false;
}
/* Sort specified array. */
temp.sort(cmp_func, false, renumber, coll);
/* Convert strings in the specified array back to UTF-8. */
errcode->clear();
s_intl_error->m_error.clear();
collator_convert_array_from_utf16_to_utf8(temp, &(errcode->code));
if (U_FAILURE(errcode->code)) {
errcode->custom_error_message =
"Error converting array from UTF-16 to UTF-8";
s_intl_error->m_error.code = errcode->code;
s_intl_error->m_error.custom_error_message = errcode->custom_error_message;
return false;
}
array = temp;
return true;
}
/**
 * Sort an array with the given collator, passing renumber = true to the
 * underlying sort.
 *
 * @param array       in/out: the array to sort
 * @param sort_flags  one of the COLLATOR_SORT_* modes
 * @param ascending   sort direction
 * @param coll        collator to use; must not be null
 * @param errcode     receives error details on failure
 * @return true on success, false on failure
 */
bool collator_sort(Variant &array, int sort_flags, bool ascending,
                   UCollator *coll, intl_error *errcode) {
  assert(coll);
  const bool renumber = true;
  return collator_sort_internal(renumber, array, sort_flags, ascending, coll,
                                errcode);
}
/**
 * Sort an array with the given collator, passing renumber = false to the
 * underlying sort.
 *
 * @param array       in/out: the array to sort
 * @param sort_flags  one of the COLLATOR_SORT_* modes
 * @param ascending   sort direction
 * @param coll        collator to use; must not be null
 * @param errcode     receives error details on failure
 * @return true on success, false on failure
 */
bool collator_asort(Variant &array, int sort_flags, bool ascending,
                    UCollator *coll, intl_error *errcode) {
  assert(coll);
  const bool renumber = false;
  return collator_sort_internal(renumber, array, sort_flags, ascending, coll,
                                errcode);
}
///////////////////////////////////////////////////////////////////////////////
}