From 8eb706747dafe8aa72cba2ac9588ae4a16a1e8d3 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 6 Oct 2019 23:34:36 +0200 Subject: Add precise floating point parsing --- src/int_math.h | 1 + src/string.c | 187 ++++++++++++++++++++++- tests/decimal_test.c | 410 ++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 593 insertions(+), 5 deletions(-) diff --git a/src/int_math.h b/src/int_math.h index 85305a4a..d36a44d2 100644 --- a/src/int_math.h +++ b/src/int_math.h @@ -21,6 +21,7 @@ #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define CLAMP(x, l, h) MAX(l, MIN(h, x)) static const uint64_t POW10[] = {1ull, 10ull, diff --git a/src/string.c b/src/string.c index b2ef8188..b867fc76 100644 --- a/src/string.c +++ b/src/string.c @@ -14,11 +14,15 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "bigint.h" +#include "ieee_float.h" #include "int_math.h" -#include "string_utils.h" - #include "serd/serd.h" +#include "soft_float.h" +#include "string_utils.h" +#include +#include #include #include #include @@ -181,12 +185,151 @@ serd_parse_double(const char* const str) return result; } +static uint64_t +normalize(SerdSoftFloat* value, const uint64_t error) +{ + const int original_e = value->e; + + *value = soft_float_normalize(*value); + + return error << (original_e - value->e); +} + +/** + Return the error added by floating point multiplication. + + Should be l + r + l*r/(2^64) + 0.5, but we short the denominator to 63 due + to lack of precision, which effectively rounds up. +*/ +static inline uint64_t +product_error(const uint64_t lerror, + const uint64_t rerror, + const uint64_t half_ulp) +{ + return lerror + rerror + ((lerror * rerror) >> 63) + half_ulp; +} + +/** + Guess the binary floating point value for decimal input. + + @param significand Significand from the input. + @param expt10 Decimal exponent from the input. + @param n_digits Number of decimal digits in the significand. + @param[out] guess Either the exact number, or its predecessor. + @return True if `guess` is correct. +*/ +static bool +sftod(const uint64_t significand, + const int expt10, + const int n_digits, + SerdSoftFloat* const guess) +{ + assert(sizeof(guess->f) == sizeof(significand)); + assert(expt10 <= max_dec_expt); + assert(expt10 >= min_dec_expt); + + /* The general idea here is to try and find a power of 10 that we can + multiply by the significand to get the number. We get one from the + cache which is possibly too small, then multiply by another power of 10 + to make up the difference if necessary. For example, with a target + power of 10^70, if we get 10^68 from the cache, then we multiply again + by 10^2. This, as well as normalization, accumulates error, which is + tracked throughout to know if we got the precise number. */ + + // Use a common denominator of 2^3 to avoid fractions + static const int lg_denom = 3; + static const uint64_t denom = 1 << 3; + static const uint64_t half_ulp = 4; + + // Start out with just the significand, and no error + SerdSoftFloat input = {significand, 0}; + uint64_t error = normalize(&input, 0); + + // Get a power of 10 that takes us most of the way without overshooting + int cached_expt10 = 0; + SerdSoftFloat pow10 = soft_float_pow10_under(expt10, &cached_expt10); + + // Get an exact fixup power if necessary + const int d_expt10 = expt10 - cached_expt10; + if (d_expt10) { + input = soft_float_multiply(input, soft_float_exact_pow10(d_expt10)); + if (d_expt10 > uint64_digits10 - n_digits) { + error += half_ulp; // Product does not fit in an integer + } + } + + // Multiply the significand by the power, normalize, and update the error + input = soft_float_multiply(input, pow10); + error = normalize(&input, product_error(error, half_ulp, half_ulp)); + + // Get the effective number of significant bits from the order of magnitude + const int magnitude = 64 + input.e; + const int real_magnitude = magnitude - dbl_subnormal_expt; + const int n_significant_bits = CLAMP(real_magnitude, 0, DBL_MANT_DIG); + + // Calculate the number of "extra" bits of precision we have + int n_extra_bits = 64 - n_significant_bits; + if (n_extra_bits + lg_denom >= 64) { + // Very small subnormal where extra * denom does not fit in an integer + // Shift right (and accumulate some more error) to compensate + const int amount = (n_extra_bits + lg_denom) - 63; + + input.f >>= amount; + input.e += amount; + error = product_error((error >> amount) + 1, half_ulp, half_ulp); + n_extra_bits -= amount; + } + + // Calculate boundaries for the extra bits (with the common denominator) + assert(n_extra_bits < 64); + const uint64_t extra_mask = (1ull << n_extra_bits) - 1; + const uint64_t extra_bits = (input.f & extra_mask) * denom; + const uint64_t middle = (1ull << (n_extra_bits - 1)) * denom; + const uint64_t low = middle - error; + const uint64_t high = middle + error; + + // Round to nearest representable double + guess->f = (input.f >> n_extra_bits) + (extra_bits >= high); + guess->e = input.e + n_extra_bits; + + // Too inaccurate if the extra bits are within the error around the middle + return extra_bits <= low || extra_bits >= high; +} + +static int +compare_buffer(const char* buf, const int expt, const SerdSoftFloat upper) +{ + SerdBigint buf_bigint; + serd_bigint_set_decimal_string(&buf_bigint, buf); + + SerdBigint upper_bigint; + serd_bigint_set_u64(&upper_bigint, upper.f); + + if (expt >= 0) { + serd_bigint_multiply_pow10(&buf_bigint, (unsigned)expt); + } else { + serd_bigint_multiply_pow10(&upper_bigint, (unsigned)-expt); + } + + if (upper.e > 0) { + serd_bigint_shift_left(&upper_bigint, (unsigned)upper.e); + } else { + serd_bigint_shift_left(&buf_bigint, (unsigned)-upper.e); + } + + return serd_bigint_compare(&buf_bigint, &upper_bigint); +} double -serd_strtod(const char* str, size_t* end) +serd_strtod(const char* const str, size_t* const end) { #define SET_END(index) if (end) { *end = (size_t)(index); } + static const int n_exact_pow10 = sizeof(POW10) / sizeof(POW10[0]); + static const int max_exact_int_digits = 15; // Digits that fit exactly + static const int max_decimal_power = 309; // Max finite power + static const int min_decimal_power = -324; // Min non-zero power + // Point s at the first non-whitespace character const char* s = str; while (is_space(*s)) { @@ -219,5 +362,41 @@ serd_strtod(const char* str, size_t* end) return (double)NAN; } - return in.sign * (in.frac * pow(10, in.frac_expt)); + const int expt = in.frac_expt; + const int result_power = in.n_digits + expt; + + // Return early for simple exact cases + if (result_power > max_decimal_power) { + return (double)in.sign * (double)INFINITY; + } else if (result_power < min_decimal_power) { + return in.sign * 0.0; + } else if (in.n_digits < max_exact_int_digits) { + if (expt < 0 && -expt < n_exact_pow10) { + return in.sign * ((double)in.frac / (double)POW10[-expt]); + } else if (expt >= 0 && expt < n_exact_pow10) { + return in.sign * ((double)in.frac * (double)POW10[expt]); + } + } + + // Try to guess the number using only soft floating point (fast path) + SerdSoftFloat guess = {0, 0}; + const bool exact = sftod(in.frac, expt, in.n_digits, &guess); + const double g = soft_float_to_double(guess); + if (exact) { + return (double)in.sign * g; + } + + // Not sure, guess is either the number or its predecessor (rare slow path) + // Compare it with the buffer using bigints to find out which + const SerdSoftFloat upper = {guess.f * 2 + 1, guess.e - 1}; + const int cmp = compare_buffer(in.digits, in.digits_expt, upper); + if (cmp < 0) { + return in.sign * g; + } else if (cmp > 0) { + return in.sign * nextafter(g, (double)INFINITY); + } else if ((guess.f & 1) == 0) { + return in.sign * g; // Round towards even + } else { + return in.sign * nextafter(g, (double)INFINITY); // Round odd up + } } diff --git a/tests/decimal_test.c b/tests/decimal_test.c index 4fb521a5..f43a1143 100644 --- a/tests/decimal_test.c +++ b/tests/decimal_test.c @@ -17,14 +17,31 @@ #undef NDEBUG #include "../src/decimal.h" +#include "../src/string_utils.h" +#include "test_data.h" #include "serd/serd.h" +#ifdef _WIN32 +#include +#else +#include +#endif + #include +#include #include #include +#include #include +#include #include +#include + +#define DBL_INFINITY ((double)INFINITY) + +static size_t n_tests = 16384ul; +static uint32_t seed = 0; static void test_count_digits(void) @@ -54,6 +71,16 @@ test_count_digits(void) assert(20 == serd_count_digits(18446744073709551615ull)); } +static void +test_strtod(void) +{ + assert(serd_strtod("1E999", NULL) == (double)INFINITY); + assert(serd_strtod("-1E999", NULL) == (double)-INFINITY); + assert(serd_strtod("1E-999", NULL) == 0.0); + assert(serd_strtod("-1E-999", NULL) == -0.0); + assert(isnan(serd_strtod("ABCDEF", NULL))); +} + static void check_precision(const double d, const unsigned precision, @@ -94,10 +121,391 @@ test_precision(void) check_precision(12345.678900, 9, 1, "12345.6"); } +/// Check that `str` is a canonical xsd:float or xsd:double string +static void +test_canonical(const char* str, const size_t len) +{ + if (!strcmp(str, "NaN") || !strcmp(str, "-INF") || !strcmp(str, "INF")) { + return; + } + + assert(len > 4); // Shortest possible is something like 1.2E3 + assert(str[0] == '-' || is_digit(str[0])); + + const int first_digit = str[0] == '-' ? 1 : 0; + assert(is_digit(str[first_digit])); + assert(str[first_digit + 1] == '.'); + assert(is_digit(str[first_digit + 2])); + + const char* const e = strchr(str, 'E'); + assert(e); + assert(*e == 'E'); + assert(*(e + 1) == '-' || is_digit(*(e + 1))); +} + +/// Check that `f` round-trips, and serialises to `expected` if given +static void +test_float_value(const float f, const char* expected) +{ + SerdNode* const node = serd_new_float(f); + const char* str = serd_node_get_string(node); + size_t end = 0; + const float result = (float)serd_strtod(str, &end); + const bool match = result == f || (isnan(f) && isnan(result)); + + if (!match) { + fprintf(stderr, "error: value is %.9g\n", (double)result); + fprintf(stderr, "note: expected %.9g\n", (double)f); + fprintf(stderr, "note: string %s\n", str); + } + + assert(match); + assert(end == serd_node_get_length(node)); + assert((isnan(f) && isnan(result)) || result == f); + assert(!expected || !strcmp(str, expected)); + + test_canonical(str, serd_node_get_length(node)); + serd_node_free(node); +} + +static void +test_float(const bool exhaustive) +{ + test_float_value(NAN, "NaN"); + test_float_value(-INFINITY, "-INF"); + test_float_value(INFINITY, "INF"); + + test_float_value(-0.0f, "-0.0E0"); + test_float_value(+0.0f, "0.0E0"); + test_float_value(-1.0f, "-1.0E0"); + test_float_value(+1.0f, "1.0E0"); + + test_float_value(5.0f, "5.0E0"); + test_float_value(50.0f, "5.0E1"); + test_float_value(5000000000.0f, "5.0E9"); + test_float_value(-0.5f, "-5.0E-1"); + test_float_value(0.5f, "5.0E-1"); + test_float_value(0.0625f, "6.25E-2"); + test_float_value(0.0078125f, "7.8125E-3"); + + // Every digit of precision + test_float_value(134217728.0f, "1.34217728E8"); + + // Normal limits + test_float_value(FLT_MIN, NULL); + test_float_value(FLT_EPSILON, NULL); + test_float_value(FLT_MAX, NULL); + + // Subnormals + test_float_value(nextafterf(0.0f, 1.0f), NULL); + test_float_value(nextafterf(0.0f, -1.0f), NULL); + + // Past limits + assert((float)serd_strtod("1e39", NULL) == INFINITY); + assert((float)serd_strtod("1e-46", NULL) == 0.0f); + + // Powers of two (where the lower boundary is closer) + for (int i = -127; i <= 127; ++i) { + test_float_value(powf(2, (float)i), NULL); + } + + if (exhaustive) { + fprintf(stderr, "Testing xsd:float exhaustively\n"); + + for (uint64_t i = 0; i <= UINT32_MAX; ++i) { + const float f = float_from_rep((uint32_t)i); + test_float_value(f, NULL); + + if (i % 1000000 == 1) { + fprintf(stderr, + "%f%%\n", + ((double)i / (double)UINT32_MAX * 100.0)); + } + } + } else { + fprintf(stderr, "Testing xsd:float randomly\n"); + const size_t n_per_report = n_tests / 10ul; + uint64_t last_report = 0; + uint32_t rep = seed; + for (uint64_t i = 0; i < n_tests; ++i) { + rep = lcg32(rep); + + const float f = float_from_rep(rep); + + test_float_value(nextafterf(f, -INFINITY), NULL); + test_float_value(f, NULL); + test_float_value(nextafterf(f, INFINITY), NULL); + + if (i / n_per_report != last_report / n_per_report) { + fprintf(stderr, + "%u%%\n", + (unsigned)((double)i / (double)n_tests * 100.0)); + last_report = i; + } + } + } +} + +/// Check that `d` round-trips, and serialises to `expected` if given +static void +test_double_value(const double d, const char* expected) +{ + SerdNode* const node = serd_new_double(d); + const char* str = serd_node_get_string(node); + size_t end = 0; + const double result = serd_strtod(str, &end); + const bool match = result == d || (isnan(d) && isnan(result)); + + if (!match) { + fprintf(stderr, "error: value is %.17g\n", result); + fprintf(stderr, "note: expected %.17g\n", d); + fprintf(stderr, "note: string %s\n", str); + } + + assert(match); + assert(end == serd_node_get_length(node)); + assert((isnan(d) && isnan(result)) || result == d); + assert(!expected || !strcmp(str, expected)); + + test_canonical(str, serd_node_get_length(node)); + serd_node_free(node); +} + +static void +test_double(void) +{ + test_double_value((double)NAN, "NaN"); + test_double_value(-DBL_INFINITY, "-INF"); + test_double_value(DBL_INFINITY, "INF"); + + test_double_value(-0.0, "-0.0E0"); + test_double_value(+0.0, "0.0E0"); + test_double_value(-1.0, "-1.0E0"); + test_double_value(+1.0, "1.0E0"); + + test_double_value(5.0, "5.0E0"); + test_double_value(50.0, "5.0E1"); + test_double_value(500000000000000000000.0, "5.0E20"); + test_double_value(-0.5, "-5.0E-1"); + test_double_value(0.5, "5.0E-1"); + test_double_value(0.05, "5.0E-2"); + test_double_value(0.005, "5.0E-3"); + test_double_value(0.00000000000000000005, "5.0E-20"); + + // Leading whitespace special cases + assert(isnan(serd_strtod(" NaN", NULL))); + assert(serd_strtod(" -INF", NULL) == -DBL_INFINITY); + assert(serd_strtod(" INF", NULL) == DBL_INFINITY); + assert(serd_strtod(" +INF", NULL) == DBL_INFINITY); + + // Every digit of precision + test_double_value(18014398509481984.0, "1.8014398509481984E16"); + + // Normal limits + test_double_value(DBL_MIN, NULL); + test_double_value(nextafter(DBL_MIN, DBL_INFINITY), NULL); + test_double_value(DBL_EPSILON, NULL); + test_double_value(DBL_MAX, NULL); + test_double_value(nextafter(DBL_MAX, -DBL_INFINITY), NULL); + + // Subnormals + test_double_value(nextafter(0.0, 1.0), NULL); + test_double_value(nextafter(nextafter(0.0, 1.0), 1.0), NULL); + test_double_value(nextafter(0.0, -1.0), NULL); + test_double_value(nextafter(nextafter(0.0, -1.0), -1.0), NULL); + + // Past limits + assert(serd_strtod("1e309", NULL) == DBL_INFINITY); + assert(serd_strtod("12345678901234567123", NULL) == 12345678901234567000.0); + assert(serd_strtod("1e-325", NULL) == 0.0); + + // Various tricky cases + test_double_value(1e23, "1.0E23"); + test_double_value(6.02951420360127e-309, "6.02951420360127E-309"); + test_double_value(9.17857104364115e+288, "9.17857104364115E288"); + test_double_value(2.68248422823759e+22, "2.68248422823759E22"); + + // Powers of two (where the lower boundary is closer) + for (int i = -1023; i <= 1023; ++i) { + test_double_value(pow(2, i), NULL); + } + + fprintf(stderr, "Testing xsd:double randomly\n"); + + const size_t n_per_report = n_tests / 10ul; + uint64_t last_report = 0; + uint64_t rep = seed; + for (uint64_t i = 0; i < n_tests; ++i) { + rep = lcg64(rep); + + const double d = double_from_rep(rep); + + test_double_value(nextafter(d, -DBL_INFINITY), NULL); + test_double_value(d, NULL); + test_double_value(nextafter(d, DBL_INFINITY), NULL); + + if (i / n_per_report != last_report / n_per_report) { + fprintf(stderr, + "%u%%\n", + (unsigned)((double)i / (double)n_tests * 100.0)); + last_report = i; + } + } +} + +/// Check that `d` round-trips, and serialises to `expected` if given +static void +test_decimal_value(const double d, const char* expected) +{ + SerdNode* const node = serd_new_decimal(d, 17, 0, NULL); + const char* str = serd_node_get_string(node); + size_t end = 0; + const double result = serd_strtod(str, &end); + const bool match = result == d || (isnan(d) && isnan(result)); + + if (!match) { + fprintf(stderr, "error: value is %.17g\n", result); + fprintf(stderr, "note: expected %.17g\n", d); + fprintf(stderr, "note: string %s\n", str); + } + + assert(match); + assert(end == serd_node_get_length(node)); + + if (expected && strcmp(str, expected)) { + fprintf(stderr, "error: string is \"%s\"\n", str); + fprintf(stderr, "note: expected \"%s\"\n", expected); + } + assert(!expected || !strcmp(str, expected)); + + serd_node_free(node); +} + +static void +test_decimal(void) +{ + test_decimal_value(-0.0, "-0.0"); + test_decimal_value(+0.0, "0.0"); + test_decimal_value(-1.0, "-1.0"); + test_decimal_value(+1.0, "1.0"); + + test_decimal_value(5.0, "5.0"); + test_decimal_value(50.0, "50.0"); + test_decimal_value(500000000000000000000.0, "500000000000000000000.0"); + test_decimal_value(-0.5, "-0.5"); + test_decimal_value(0.5, "0.5"); + test_decimal_value(0.05, "0.05"); + test_decimal_value(0.005, "0.005"); + test_decimal_value(0.00000000000000000005, "0.00000000000000000005"); + + // Every digit of precision + test_decimal_value(18014398509481984.0, "18014398509481984.0"); + + // Normal limits + test_decimal_value(DBL_MIN, NULL); + test_decimal_value(nextafter(DBL_MIN, DBL_INFINITY), NULL); + test_decimal_value(DBL_EPSILON, NULL); + test_decimal_value(DBL_MAX, NULL); + test_decimal_value(nextafter(DBL_MAX, -DBL_INFINITY), NULL); + + // Subnormals + test_decimal_value(nextafter(0.0, 1.0), NULL); + test_decimal_value(nextafter(nextafter(0.0, 1.0), 1.0), NULL); + test_decimal_value(nextafter(0.0, -1.0), NULL); + test_decimal_value(nextafter(nextafter(0.0, -1.0), -1.0), NULL); + + // Past limits + assert(serd_strtod("1e309", NULL) == DBL_INFINITY); + assert(serd_strtod("12345678901234567123", NULL) == 12345678901234567000.0); + assert(serd_strtod("1e-325", NULL) == 0.0); + + // Various tricky cases + test_decimal_value(1e23, NULL); + test_decimal_value(6.02951420360127e-309, NULL); + test_decimal_value(9.17857104364115e+288, NULL); + test_decimal_value(2.68248422823759e+22, NULL); + + // Powers of two (where the lower boundary is closer) + for (int i = -1023; i <= 1023; ++i) { + test_decimal_value(pow(2, i), NULL); + } + + fprintf(stderr, "Testing xsd:decimal randomly\n"); + + const size_t n_per_report = n_tests / 10ul; + uint64_t last_report = 0; + uint64_t rep = seed; + for (uint64_t i = 0; i < n_tests; ++i) { + rep = lcg64(rep); + + const double d = double_from_rep(rep); + if (!isfinite(d)) { + continue; + } + + test_decimal_value(nextafter(d, (double)-INFINITY), NULL); + test_decimal_value(d, NULL); + test_decimal_value(nextafter(d, (double)INFINITY), NULL); + + if (i / n_per_report != last_report / n_per_report) { + fprintf(stderr, + "%u%%\n", + (unsigned)((double)i / (double)n_tests * 100.0)); + last_report = i; + } + } +} + +static int +print_usage(const char* name) +{ + fprintf(stderr, "Usage: %s [OPTION]...\n", name); + fprintf(stderr, "Test floating point conversion.\n"); + fprintf(stderr, " -n NUM_TESTS Number of random tests to run.\n"); + fprintf(stderr, " -s SEED Use random seed.\n"); + fprintf(stderr, " -x Exhaustively test floats.\n"); + return 1; +} + int -main(void) +main(int argc, char** argv) { + // Parse command line arguments + int a = 1; + bool exhaustive = false; + for (; a < argc && argv[a][0] == '-'; ++a) { + if (argv[a][1] == '\0') { + break; + } else if (argv[a][1] == 'x') { + exhaustive = true; + } else if (argv[a][1] == 's') { + if (++a == argc) { + return print_usage(argv[0]); + } + + seed = (uint32_t)strtol(argv[a], NULL, 10); + } else if (argv[a][1] == 'n') { + if (++a == argc) { + return print_usage(argv[0]); + } + + n_tests = (uint32_t)strtol(argv[a], NULL, 10); + } + } + + if (!seed) { + seed = (uint32_t)time(NULL) + (uint32_t)getpid(); + } + + fprintf(stderr, "Using random seed %u\n", seed); + test_count_digits(); + test_strtod(); test_precision(); + test_float(exhaustive); + test_double(); + test_decimal(); + + fprintf(stderr, "All tests passed\n"); return 0; } -- cgit v1.2.1