/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com)     |
   | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   +----------------------------------------------------------------------+
   | This source file is subject to version 2.00 of the Zend license,     |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.zend.com/license/2_00.txt.                                |
   | If you did not receive a copy of the Zend license and are unable to  |
   | obtain it through the world-wide-web, please send a note to          |
   | license@zend.com so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
*/

#include "hphp/runtime/base/zend_collator.h"
#include "hphp/runtime/base/zend_strtod.h"
#include "hphp/runtime/base/intl_convert.h"
#include "hphp/runtime/base/type_conversions.h"
#include "hphp/runtime/base/builtin_functions.h"
#include "hphp/runtime/base/types.h"
#include "hphp/runtime/base/complex_types.h"
#include "hphp/runtime/base/runtime_error.h"
#include "hphp/runtime/base/array_iterator.h"
#include "hphp/runtime/base/comparisons.h"

namespace HPHP {

IMPLEMENT_REQUEST_LOCAL(IntlError, s_intl_error);

/* Convert a length in bytes to a length in UChar units, and back. */
#define UCHARS(len) ((len) / sizeof(UChar))
#define UBYTES(len) ((len) * sizeof(UChar))

static Variant collator_convert_string_to_number_if_possible(CVarRef str);

/*
 * strtod() counterpart for NUL-terminated UChar strings.
 *
 * Scans optional leading whitespace, an optional sign, decimal digits, an
 * optional fraction and an optional exponent.  The scanned characters are
 * narrowed to char and handed to zend_strtod().
 *
 * nptr    - string to parse.
 * endptr  - if non-null, receives the position just past the parsed number,
 *           or nptr when no digits were found.
 *
 * Returns the parsed value, or 0 when nothing could be parsed.
 */
static double collator_u_strtod(const UChar *nptr, UChar **endptr) {
  const UChar *u = nptr, *nstart;
  UChar c = *u;
  int any = 0;

  while (u_isspace(c)) {
    c = *++u;
  }
  nstart = u;

  if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
    c = *++u;
  }

  while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
    any = 1;
    c = *++u;
  }

  if (c == 0x2E /*'.'*/) {
    c = *++u;
    while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
      any = 1;
      c = *++u;
    }
  }

  if ((c == 0x65 /*'e'*/ || c == 0x45 /*'E'*/) && any) {
    const UChar *e = u;
    int any_exp = 0;

    c = *++u;
    if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) {
      c = *++u;
    }

    while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) {
      any_exp = 1;
      c = *++u;
    }

    /* An 'e' without exponent digits is not part of the number. */
    if (!any_exp) {
      u = e;
    }
  }

  if (any) {
    char buf[64], *numbuf, *bufpos;
    int length = u - nstart;
    double value;

    /* Use the stack buffer when the number (plus NUL) fits in it. */
    if (length < (int)sizeof(buf)) {
      numbuf = buf;
    } else {
      numbuf = (char *) malloc(length + 1);
      if (numbuf == nullptr) {
        /* Out of memory: report "no conversion" rather than writing
         * through a null pointer. */
        if (endptr != nullptr) {
          *endptr = (UChar *)nptr;
        }
        return 0;
      }
    }

    bufpos = numbuf;

    while (nstart < u) {
      *bufpos++ = (char) *nstart++;
    }

    *bufpos = '\0';
    value = zend_strtod(numbuf, nullptr);

    if (numbuf != buf) {
      free(numbuf);
    }

    if (endptr != nullptr) {
      *endptr = (UChar *)u;
    }

    return value;
  }

  if (endptr != nullptr) {
    *endptr = (UChar *)nptr;
  }

  return 0;
}

/*
 * strtol() counterpart for NUL-terminated UChar strings.
 *
 * nptr    - string to parse; a null pointer sets errno to ERANGE, stores
 *           nullptr in *endptr and returns 0.
 * endptr  - if non-null, receives the position of the first character that
 *           was not consumed, or nptr when no digits were consumed.
 * base    - numeric base; 0 selects hex ("0x"), octal (leading "0") or
 *           decimal automatically.
 *
 * Returns the parsed value.  On overflow errno is set to ERANGE and
 * LONG_MIN / LONG_MAX is returned depending on the sign.
 */
static long collator_u_strtol(const UChar *nptr, UChar **endptr, int base) {
  const UChar *s = nptr;
  unsigned long acc;
  UChar c;
  unsigned long cutoff;
  int neg = 0, any, cutlim;

  if (s == nullptr) {
    errno = ERANGE;
    if (endptr != nullptr) {
      *endptr = nullptr;
    }
    return 0;
  }

  /*
   * Skip white space and pick up leading +/- sign if any.
   * If base is 0, allow 0x for hex and 0 for octal, else
   * assume decimal; if base is already 16, allow 0x.
   */
  do {
    c = *s++;
  } while (u_isspace(c));
  if (c == 0x2D /*'-'*/) {
    neg = 1;
    c = *s++;
  } else if (c == 0x2B /*'+'*/)
    c = *s++;
  if ((base == 0 || base == 16) &&
      (c == 0x30 /*'0'*/) &&
      (*s == 0x78 /*'x'*/ || *s == 0x58 /*'X'*/)) {
    c = s[1];
    s += 2;
    base = 16;
  }
  if (base == 0)
    base = (c == 0x30 /*'0'*/) ? 8 : 10;

  /*
   * Compute the cutoff value between legal numbers and illegal
   * numbers.  That is the largest legal value, divided by the
   * base.  An input number that is greater than this value, if
   * followed by a legal input character, is too big.  One that
   * is equal to this value may be valid or not; the limit
   * between valid and invalid numbers is then based on the last
   * digit.  For instance, if the range for longs is
   * [-2147483648..2147483647] and the input base is 10,
   * cutoff will be set to 214748364 and cutlim to either
   * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
   * a value > 214748364, or equal but the next digit is > 7 (or 8),
   * the number is too big, and we will return a range error.
   *
   * Set any if any `digits' consumed; make it negative to indicate
   * overflow.
   */
  cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
  cutlim = cutoff % (unsigned long)base;
  cutoff /= (unsigned long)base;
  for (acc = 0, any = 0;; c = *s++) {
    if (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/)
      c -= 0x30 /*'0'*/;
    else if (c >= 0x41 /*'A'*/ && c <= 0x5A /*'Z'*/)
      c -= 0x41 /*'A'*/ - 10;
    else if (c >= 0x61 /*'a'*/ && c <= 0x7A /*'z'*/)
      c -= 0x61 /*'a'*/ - 10;
    else
      break;
    if (c >= base)
      break;

    if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
      any = -1;
    else {
      any = 1;
      acc *= base;
      acc += c;
    }
  }
  if (any < 0) {
    acc = neg ? LONG_MIN : LONG_MAX;
    errno = ERANGE;
  } else if (neg)
    acc = -acc;
  if (endptr != nullptr)
    *endptr = (UChar *)(any ? s - 1 : nptr);
  return (acc);
}

/*
 * Classify a UChar string as integer, floating point or non-numeric.
 *
 * str          - string to examine.
 * length       - length of str in UChar units.
 * lval         - if non-null, receives the value when KindOfInt64 is
 *                returned.
 * dval         - if non-null, receives the value when KindOfDouble is
 *                returned.
 * allow_errors - 0: only fully numeric strings are accepted;
 *                -1: a numeric prefix is accepted and a notice is raised;
 *                any other non-zero value: a numeric prefix is accepted
 *                silently.
 *
 * Returns KindOfInt64, KindOfDouble, or KindOfNull when the string is not
 * considered numeric.
 */
static DataType collator_is_numeric(UChar *str, int length, int64_t *lval,
                                    double *dval, int allow_errors ) {
  int64_t local_lval;
  double local_dval;
  UChar *end_ptr_long, *end_ptr_double;
  int conv_base=10;

  if (!length) {
    return KindOfNull;
  }

  /* handle hex numbers */
  if (length>=2 && str[0]=='0' && (str[1]=='x' || str[1]=='X')) {
    conv_base=16;
  }

  errno=0;
  local_lval = collator_u_strtol(str, &end_ptr_long, conv_base);
  if (errno != ERANGE) {
    if (end_ptr_long == str+length) {
      /* integer string */
      if (lval) {
        *lval = local_lval;
      }
      return KindOfInt64;
    } else if (end_ptr_long == str && *end_ptr_long != '\0' &&
               *str != '.' && *str != '-') {
      /* ignore partial string matches */
      return KindOfNull;
    }
  } else {
    /* Integer overflow: fall through and try parsing as a double. */
    end_ptr_long = nullptr;
  }

  if (conv_base == 16) {
    /* hex string, under UNIX strtod() messes it up */
    /* UTODO: keep compatibility with is_numeric_string() here? */
    return KindOfNull;
  }

  local_dval = collator_u_strtod(str, &end_ptr_double);
  if (local_dval == 0 && end_ptr_double == str) {
    end_ptr_double = nullptr;
  } else {
    if (end_ptr_double == str+length) {
      /* floating point string */
      if (!finite(local_dval)) {
        /* "inf","nan" and maybe other weird ones */
        return KindOfNull;
      }

      if (dval) {
        *dval = local_dval;
      }
      return KindOfDouble;
    }
  }

  if (!allow_errors) {
    return KindOfNull;
  }
  if (allow_errors == -1) {
    raise_notice("A non well formed numeric value encountered");
  }

  /* allow_errors is non-zero from here on: accept the longer of the two
   * partial parses. */
  if (end_ptr_double > end_ptr_long && dval) {
    *dval = local_dval;
    return KindOfDouble;
  } else if (end_ptr_long && lval) {
    *lval = local_lval;
    return KindOfInt64;
  }
  return KindOfNull;
}

/*
 * Convert a UTF-8 String to a String whose bytes are UTF-16 code units.
 *
 * status - receives the result of intl_convert_utf8_to_utf16().
 *
 * Returns an empty string when the conversion fails.
 */
static String intl_convert_str_utf8_to_utf16(CStrRef utf8_str,
                                             UErrorCode * status) {
  UChar* ustr = nullptr;
  int ustr_len = 0;
  intl_convert_utf8_to_utf16(&ustr, &ustr_len,
                             utf8_str.data(), utf8_str.length(),
                             status);
  if (U_FAILURE(*status)) {
    return "";
  }
  return String((char*)ustr, UBYTES(ustr_len), AttachString);
}

/*
 * Convert a String holding UTF-16 code units to a UTF-8 String.
 *
 * status - receives the result of intl_convert_utf16_to_utf8().
 *
 * Returns an empty string when the conversion fails.
 */
static String intl_convert_str_utf16_to_utf8(CStrRef utf16_str,
                                             UErrorCode * status) {
  char* str = nullptr;
  int str_len = 0;
  intl_convert_utf16_to_utf8(&str, &str_len,
                             (UChar*)(utf16_str.data()),
                             UCHARS(utf16_str.length()),
                             status);
  if (U_FAILURE(*status)) {
    return "";
  }
  return String(str, str_len, AttachString);
}

/*
 * Convert a UTF-16 string value to a number; yields 0 when the value
 * cannot be converted.
 */
static Variant collator_convert_string_to_number(CVarRef str) {
  Variant num = collator_convert_string_to_number_if_possible(str);
  if (same(num, false)) {
    /* String wasn't converted => return zero. */
    return 0;
  }
  return num;
}

/*
 * Convert a UTF-16 string value to a double (0 when not numeric).
 */
static Variant collator_convert_string_to_double(CVarRef str) {
  Variant num = collator_convert_string_to_number(str);
  return num.toDouble();
}

/*
 * Try to interpret a UTF-16 string value as a number.
 *
 * Returns the int64 or double value on success, or boolean false when str
 * is not a string or does not start with a number (partial matches are
 * accepted because collator_is_numeric() is called with allow_errors = 1).
 */
static Variant collator_convert_string_to_number_if_possible(CVarRef str) {
  int64_t lval = 0;
  double dval = 0;

  if (!str.isString()) return false;

  /* Keep the string alive in a local instead of creating two temporaries. */
  String ustr = str.toString();
  DataType ret = collator_is_numeric((UChar*)(ustr.data()),
                                     UCHARS(ustr.length()),
                                     &lval, &dval, 1);
  if (ret == KindOfInt64) return lval;
  if (ret == KindOfDouble) return dval;
  return false;
}

/*
 * If obj is an object, cast it to string and convert the result to UTF-16.
 *
 * Non-objects, and objects whose string cast throws, are returned
 * unchanged.  When the UTF-16 conversion fails a warning is raised and
 * null is returned.
 */
static Variant collator_convert_object_to_string(CVarRef obj) {
  if (!obj.isObject()) return obj;
  String str;
  try {
    str = obj.toString();
  } catch (Exception &e) {
    return obj;
  }
  /* ICU convention: error codes start out as U_ZERO_ERROR. */
  UErrorCode status = U_ZERO_ERROR;
  String ustr = intl_convert_str_utf8_to_utf16(str, &status);
  if (U_FAILURE(status)) {
    raise_warning("Error casting object to string in "
                  "collator_convert_object_to_string()");
    return uninit_null();
  }
  return ustr;
}

/*
 * Convert every string value of array from UTF-16 to UTF-8 in place.
 * Stops at the first conversion failure, leaving the error in *status.
 */
static void collator_convert_array_from_utf16_to_utf8(Array &array,
                                                      UErrorCode * status) {
  for (ArrayIter iter(array); iter; ++iter) {
    CVarRef value = iter.secondRef();
    /* Process string values only. */
    if (!value.isString()) continue;
    String str = intl_convert_str_utf16_to_utf8(value.toString(), status);
    if (U_FAILURE(*status)) {
      return;
    }
    /* Update current value with the converted value. */
    const_cast<Variant&>(value) = str;
  }
}

/*
 * Convert every string value of array from UTF-8 to UTF-16, storing the
 * converted value back under the same key.  Stops at the first conversion
 * failure, leaving the error in *status.
 */
static void collator_convert_array_from_utf8_to_utf16(Array &array,
                                                      UErrorCode * status) {
  for (ArrayIter iter(array); iter; ++iter) {
    CVarRef value = iter.secondRef();
    /* Process string values only. */
    if (!value.isString()) continue;
    String str = intl_convert_str_utf8_to_utf16(value.toString(), status);
    if (U_FAILURE(*status)) {
      return;
    }
    /* Update current value with the converted value. */
    Variant key = iter.first();
    array.set(key, str);
  }
}

/*
 * Prepare a sort argument for the PHP-style comparison: non-strings pass
 * through, numeric UTF-16 strings become numbers, and other UTF-16 strings
 * are converted to UTF-8 (a warning is raised if that conversion fails).
 */
static Variant collator_normalize_sort_argument(CVarRef arg) {
  if (!arg.isString()) return arg;
  Variant n_arg = collator_convert_string_to_number_if_possible(arg);
  if (same(n_arg, false)) {
    /* Conversion to number failed. */
    UErrorCode status = U_ZERO_ERROR;
    n_arg = intl_convert_str_utf16_to_utf8(arg.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf16 to utf8 in "
                    "collator_normalize_sort_argument()");
    }
  }
  return n_arg;
}

/*
 * COLLATOR_SORT_REGULAR comparison.
 *
 * v1, v2    - values to compare (strings are expected to be UTF-16).
 * data      - the UCollator to use; must be non-null when both values are
 *             strings and at least one of them is not numeric.
 * ascending - false inverts the result.
 *
 * Returns a negative, zero or positive value in the usual comparator
 * sense.
 */
static int collator_regular_compare_function(CVarRef v1, CVarRef v2,
                                             const void *data,
                                             bool ascending) {
  Variant str1 = collator_convert_object_to_string(v1);
  Variant str2 = collator_convert_object_to_string(v2);
  Variant norm1;
  Variant norm2;

  /* If both args are strings AND either of args is not numeric string
   * then use ICU-compare. Otherwise PHP-compare. */
  if (str1.isString() && str2.isString()) {
    Variant num1 = collator_convert_string_to_number_if_possible(str1);
    Variant num2;
    if (!same(num1, false)) {
      num2 = collator_convert_string_to_number_if_possible(str2);
    }
    if (same(num1, false) || same(num2, false)) {
      assert(data);
      String ustr1 = str1.toString();
      String ustr2 = str2.toString();
      int ret = ucol_strcoll((const UCollator *)data,
                             (UChar*)(ustr1.data()),
                             UCHARS(ustr1.length()),
                             (UChar*)(ustr2.data()),
                             UCHARS(ustr2.length()));
      return ascending ? ret : (-ret);
    }
    /* Both are numeric strings => passthru to PHP-compare. */
    norm1 = num1;
    norm2 = num2;
  } else {
    /* str1 or str2 is not a string => do normalization. */
    norm1 = collator_normalize_sort_argument(str1);
    norm2 = collator_normalize_sort_argument(str2);
  }

  if (ascending) {
    if (less(norm1, norm2)) return -1;
    if (equal(norm1, norm2)) return 0;
    return 1;
  }
  if (less(norm1, norm2)) return 1;
  if (equal(norm1, norm2)) return 0;
  return -1;
}

/* Ascending wrapper around collator_regular_compare_function(). */
static int collator_regular_compare_ascending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  return collator_regular_compare_function(v1, v2, data, true);
}

/* Descending wrapper around collator_regular_compare_function(). */
static int collator_regular_compare_descending(CVarRef v1, CVarRef v2,
                                               const void *data) {
  return collator_regular_compare_function(v1, v2, data, false);
}

/*
 * COLLATOR_SORT_NUMERIC comparison: both values are converted to double
 * (UTF-16 strings through collator_convert_string_to_double()) and then
 * compared.  data is unused.  ascending == false inverts the result.
 */
static int collator_numeric_compare_function(CVarRef v1, CVarRef v2,
                                             const void *data,
                                             bool ascending) {
  Variant num1;
  Variant num2;

  if (v1.isString()) {
    num1 = collator_convert_string_to_double(v1);
  } else {
    num1 = v1.toDouble();
  }
  if (v2.isString()) {
    num2 = collator_convert_string_to_double(v2);
  } else {
    num2 = v2.toDouble();
  }

  if (ascending) {
    if (less(num1, num2)) return -1;
    if (equal(num1, num2)) return 0;
    return 1;
  }
  if (less(num1, num2)) return 1;
  if (equal(num1, num2)) return 0;
  return -1;
}

/* Ascending wrapper around collator_numeric_compare_function(). */
static int collator_numeric_compare_ascending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  return collator_numeric_compare_function(v1, v2, data, true);
}

/* Descending wrapper around collator_numeric_compare_function(). */
static int collator_numeric_compare_descending(CVarRef v1, CVarRef v2,
                                               const void *data) {
  return collator_numeric_compare_function(v1, v2, data, false);
}

/*
 * COLLATOR_SORT_STRING comparison.
 *
 * String values are used as-is (expected to be UTF-16 already); any other
 * value is cast to string and converted from UTF-8 to UTF-16, with a
 * warning raised on conversion failure.  The two strings are then compared
 * with ucol_strcoll() using the UCollator passed in data (must be
 * non-null).  ascending == false inverts the result.
 */
static int collator_string_compare_function(CVarRef v1, CVarRef v2,
                                            const void *data,
                                            bool ascending) {
  assert(data);
  String str1;
  if (v1.isString()) {
    str1 = v1.toString();
  } else {
    UErrorCode status = U_ZERO_ERROR;
    str1 = intl_convert_str_utf8_to_utf16(v1.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf8 to utf16 in "
                    "collator_string_compare_function()");
    }
  }
  String str2;
  if (v2.isString()) {
    str2 = v2.toString();
  } else {
    UErrorCode status = U_ZERO_ERROR;
    str2 = intl_convert_str_utf8_to_utf16(v2.toString(), &status);
    if (U_FAILURE(status)) {
      raise_warning("Error converting utf8 to utf16 in "
                    "collator_string_compare_function()");
    }
  }
  int ret = ucol_strcoll((const UCollator *)data,
                         (UChar*)(str1.data()),
                         UCHARS(str1.length()),
                         (UChar*)(str2.data()),
                         UCHARS(str2.length()));
  return ascending ? ret : (-ret);
}

/* Ascending wrapper around collator_string_compare_function(). */
static int collator_string_compare_ascending(CVarRef v1, CVarRef v2,
                                             const void *data) {
  return collator_string_compare_function(v1, v2, data, true);
}

/* Descending wrapper around collator_string_compare_function(). */
static int collator_string_compare_descending(CVarRef v1, CVarRef v2,
                                              const void *data) {
  return collator_string_compare_function(v1, v2, data, false);
}

/*
 * Shared implementation of collator_sort() and collator_asort().
 *
 * renumber   - forwarded to Array::sort(); collator_sort() passes true and
 *              collator_asort() passes false.
 *              NOTE(review): presumably controls whether keys are
 *              renumbered after sorting -- confirm against Array::sort().
 * array      - value to sort; replaced by the sorted array on success and
 *              left untouched on failure.
 * sort_flags - COLLATOR_SORT_NUMERIC, COLLATOR_SORT_STRING, or anything
 *              else for the regular comparison.
 * ascending  - sort direction.
 * coll       - collator handed to the comparison callbacks; must be
 *              non-null.
 * errcode    - cleared on entry; receives the ICU error code and a custom
 *              message when a UTF-8 <-> UTF-16 conversion fails.  The same
 *              information is mirrored into s_intl_error->m_error.
 *
 * Returns true on success, false when a conversion failed.
 */
static bool collator_sort_internal(bool renumber, Variant &array,
                                   int sort_flags, bool ascending,
                                   UCollator *coll, intl_error * errcode) {
  assert(coll);
  errcode->clear();
  s_intl_error->m_error.clear();
  Array temp = array.toArray();
  Array::PFUNC_CMP cmp_func;

  switch (sort_flags) {
  case COLLATOR_SORT_NUMERIC:
    cmp_func = ascending ? collator_numeric_compare_ascending
                         : collator_numeric_compare_descending;
    break;
  case COLLATOR_SORT_STRING:
    cmp_func = ascending ? collator_string_compare_ascending
                         : collator_string_compare_descending;
    break;
  case COLLATOR_SORT_REGULAR:
  default:
    cmp_func = ascending ? collator_regular_compare_ascending
                         : collator_regular_compare_descending;
    break;
  }

  /* Convert strings in the specified array from UTF-8 to UTF-16. */
  collator_convert_array_from_utf8_to_utf16(temp, &(errcode->code));
  if (U_FAILURE(errcode->code)) {
    errcode->custom_error_message =
      "Error converting array from UTF-8 to UTF-16";
    s_intl_error->m_error.code = errcode->code;
    s_intl_error->m_error.custom_error_message =
      errcode->custom_error_message;
    return false;
  }

  /* Sort specified array. */
  temp.sort(cmp_func, false, renumber, coll);

  /* Convert strings in the specified array back to UTF-8. */
  errcode->clear();
  s_intl_error->m_error.clear();
  collator_convert_array_from_utf16_to_utf8(temp, &(errcode->code));
  if (U_FAILURE(errcode->code)) {
    errcode->custom_error_message =
      "Error converting array from UTF-16 to UTF-8";
    s_intl_error->m_error.code = errcode->code;
    s_intl_error->m_error.custom_error_message =
      errcode->custom_error_message;
    return false;
  }
  array = temp;
  return true;
}

/*
 * Sort array with the given collator; calls collator_sort_internal() with
 * renumber = true.  Returns false when a string conversion failed (details
 * are left in errcode).
 */
bool collator_sort(Variant &array, int sort_flags, bool ascending,
                   UCollator *coll, intl_error *errcode) {
  assert(coll);
  bool ret = collator_sort_internal(true, array, sort_flags, ascending, coll,
                                    errcode);
  return ret;
}

/*
 * Sort array with the given collator; calls collator_sort_internal() with
 * renumber = false.  Returns false when a string conversion failed (details
 * are left in errcode).
 */
bool collator_asort(Variant &array, int sort_flags, bool ascending,
                    UCollator *coll, intl_error *errcode) {
  assert(coll);
  bool ret = collator_sort_internal(false, array, sort_flags, ascending, coll,
                                    errcode);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////
}