Add precise decimal digit generation

author: David Robillard <d@drobilla.net> 2019-10-06 20:40:56 +0200
committer: David Robillard <d@drobilla.net> 2020-06-21 18:12:04 +0200
commit: a204962e35263d0f0aca8e9b34e5091669c7e057 (patch)
tree: f31fa1e5fe01a45011db5a6e88f47500cce4f06e
parent: e7cacc2a5ec010f6cc0bc12bd0bd4657c04236ee (diff)
download: serd-a204962e35263d0f0aca8e9b34e5091669c7e057.tar.gz
serd-a204962e35263d0f0aca8e9b34e5091669c7e057.tar.bz2
serd-a204962e35263d0f0aca8e9b34e5091669c7e057.zip
6 files changed, 400 insertions, 66 deletions
diff --git a/serd/serd.h b/serd/serd.h
index 8526a9ab..cde9a3c4 100644
--- a/serd/serd.h
+++ b/serd/serd.h
@@ -1,5 +1,5 @@
 /*
-  Copyright 2011-2017 David Robillard <http://drobilla.net>
+  Copyright 2011-2020 David Robillard <http://drobilla.net>
 
   Permission to use, copy, modify, and/or distribute this software for any
   purpose with or without fee is hereby granted, provided that the above
@@ -702,16 +702,24 @@ serd_new_relative_uri(const char*     str,
    will be written after the decimal point, but trailing zeros will
    automatically be omitted (except one if `d` is a round integer).
 
-   Note that about 16 and 8 fractional digits are required to precisely
-   represent a double and float, respectively.
+   Note that 17 and 9 significant digits are required to precisely represent
+   any double and float, respectively.  This function may be lossy if
+   `precision` is too low, or if `frac_digits` is not 0 (unlimited).
+
+   This function may produce very long strings, serd_new_float() or
+   serd_new_double() may be a better choice for very large or small numbers.
 
    @param d The value for the new node.
-   @param frac_digits The maximum number of digits after the decimal place.
+   @param max_precision Maximum number of significant decimal digits.
+   @param max_frac_digits Maximum number of digits after decimal point, or 0.
    @param datatype Datatype of node, or NULL for xsd:decimal.
 */
 SERD_API
 SerdNode*
-serd_new_decimal(double d, unsigned frac_digits, const SerdNode* datatype);
+serd_new_decimal(double          d,
+                 unsigned        max_precision,
+                 unsigned        max_frac_digits,
+                 const SerdNode* datatype);
 
 /**
    Create a new node by serialising `i` into an xsd:integer string
diff --git a/src/decimal.c b/src/decimal.c
index f60eafa7..bdd111a7 100644
--- a/src/decimal.c
+++ b/src/decimal.c
@@ -16,11 +16,14 @@
 
 #include "decimal.h"
 
+#include "bigint.h"
+#include "ieee_float.h"
 #include "int_math.h"
+#include "soft_float.h"
 
 #include <assert.h>
-#include <float.h>
 #include <math.h>
+#include <stdbool.h>
 #include <stddef.h>
 
 int
@@ -29,60 +32,216 @@ serd_count_digits(const uint64_t i)
 	return i == 0 ? 1 : (int)serd_ilog10(i) + 1;
 }
 
-unsigned
-serd_double_int_digits(const double abs)
+/*
+  This is more or less just an implementation of the classic rational number
+  based floating point print routine ("Dragon4").  See "How to Print
+  Floating-Point Numbers Accurately" by Guy L. Steele Jr. and Jon L White for
+  the canonical source.  The basic idea is to find a big rational between 1 and
+  10 where value = (numer / denom) * 10^e, then continuously divide it to
+  generate decimal digits.
+
+  Unfortunately, this algorithm requires pretty massive bigints to work
+  correctly for all doubles, and isn't particularly fast.  Something like
+  Grisu3 could be added to improve performance, but that has the annoying
+  property of needing a more precise fallback in some cases, meaning it would
+  only add more code, not replace any.  Since this is already a pretty
+  ridiculous amount of code, I'll hold off on this until it becomes a problem,
+  or somebody comes up with a better algorithm.
+*/
+
+/// Return true if the number is within the lower boundary
+static bool
+within_lower(const SerdBigint* const numer,
+             const SerdBigint* const d_lower,
+             const bool              is_even)
+{
+	return is_even ? serd_bigint_compare(numer, d_lower) <= 0
+	               : serd_bigint_compare(numer, d_lower) < 0;
+}
+
+/// Return true if the number is within the upper boundary
+static bool
+within_upper(const SerdBigint* const numer,
+             const SerdBigint* const denom,
+             const SerdBigint* const d_upper,
+             const bool              is_even)
 {
-	const double lg = ceil(log10(floor(abs) + 1.0));
-	return lg < 1.0 ? 1U : (unsigned)lg;
+	return is_even ? serd_bigint_plus_compare(numer, d_upper, denom) >= 0
+	               : serd_bigint_plus_compare(numer, d_upper, denom) > 0;
 }
 
-unsigned
-serd_decimals(const double d, char* const buf, const unsigned frac_digits)
+/**
+   Find values so that 0.1 <= numer/denom < 1 or 1 <= numer/denom < 10.
+
+   @param significand Double significand.
+   @param exponent Double exponent (base 2).
+   @param decimal_power Decimal exponent (log10 of the double).
+   @param[out] numer Numerator of rational number.
+   @param[out] denom Denominator of rational number.
+   @param[out] delta Distance to the lower and upper boundaries.
+*/
+static void
+calculate_initial_values(const uint64_t    significand,
+                         const int         exponent,
+                         const int         decimal_power,
+                         const bool        lower_is_closer,
+                         SerdBigint* const numer,
+                         SerdBigint* const denom,
+                         SerdBigint* const delta)
 {
-	assert(isfinite(d));
-
-	// Point s to decimal point location
-	const double   abs_d      = fabs(d);
-	const double   int_part   = floor(abs_d);
-	const unsigned int_digits = serd_double_int_digits(abs_d);
-	unsigned       n_bytes    = 0;
-	char*          s          = buf + int_digits;
-	if (d < 0.0) {
-		*buf = '-';
-		++s;
+	/* Use a common denominator of 2^1 so that boundary distance is an integer.
+	   If the lower boundary is closer, we need to scale everything but the
+	   lower boundary to compensate, so add another factor of two here (this is
+	   faster than shifting them again later as in the paper). */
+	const unsigned lg_denom = 1u + lower_is_closer;
+
+	if (exponent >= 0) {
+		// delta = 2^e
+		serd_bigint_set_u32(delta, 1);
+		serd_bigint_shift_left(delta, (unsigned)exponent);
+
+		// numer = f * 2^e
+		serd_bigint_set_u64(numer, significand);
+		serd_bigint_shift_left(numer, (unsigned)exponent + lg_denom);
+
+		// denom = 10^d
+		serd_bigint_set_pow10(denom, (unsigned)decimal_power);
+		serd_bigint_shift_left(denom, lg_denom);
+	} else if (decimal_power >= 0) {
+		// delta = 2^e, which is just 1 here since 2^-e is in the denominator
+		serd_bigint_set_u32(delta, 1);
+
+		// numer = f
+		serd_bigint_set_u64(numer, significand);
+		serd_bigint_shift_left(numer, lg_denom);
+
+		// denom = 10^d * 2^-e
+		serd_bigint_set_pow10(denom, (unsigned)decimal_power);
+		serd_bigint_shift_left(denom, (unsigned)-exponent + lg_denom);
+	} else {
+		// delta = 10^d
+		serd_bigint_set_pow10(delta, (unsigned)-decimal_power);
+
+		// numer = f * 10^-d
+		serd_bigint_set(numer, delta);
+		serd_bigint_multiply_u64(numer, significand);
+		serd_bigint_shift_left(numer, lg_denom);
+
+		// denom = 2^-exponent
+		serd_bigint_set_u32(denom, 1);
+		serd_bigint_shift_left(denom, (unsigned)-exponent + lg_denom);
 	}
+}
 
-	// Write integer part (right to left)
-	char*    t   = s - 1;
-	uint64_t dec = (uint64_t)int_part;
-	do {
-		*t-- = (char)('0' + dec % 10);
-	} while ((dec /= 10) > 0);
+#ifndef NDEBUG
+static bool
+check_initial_values(const SerdBigint* const numer,
+                     const SerdBigint* const denom,
+                     const SerdBigint* const d_upper)
+{
+	SerdBigint upper = *numer;
+	serd_bigint_add(&upper, d_upper);
+	assert(serd_bigint_compare(&upper, denom) >= 0);
 
+	const uint32_t div = serd_bigint_divmod(&upper, denom);
+	assert(div >= 1 && div < 10);
+	return true;
+}
+#endif
+
+static unsigned
+emit_digits(SerdBigint* const       numer,
+            const SerdBigint* const denom,
+            SerdBigint* const       d_lower,
+            SerdBigint* const       d_upper,
+            const bool              is_even,
+            char* const             buffer,
+            const size_t            max_digits)
+{
+	unsigned length = 0;
+	for (size_t i = 0; i < max_digits; ++i) {
+		// Emit the next digit
+		const uint32_t digit = serd_bigint_divmod(numer, denom);
+		assert(digit <= 9);
+		buffer[length++] = (char)('0' + digit);
+
+		// Check for termination
+		const bool within_low  = within_lower(numer, d_lower, is_even);
+		const bool within_high = within_upper(numer, denom, d_upper, is_even);
+		if (!within_low && !within_high) {
+			serd_bigint_multiply_u32(numer, 10);
+			serd_bigint_multiply_u32(d_lower, 10);
+			if (d_lower != d_upper) {
+				serd_bigint_multiply_u32(d_upper, 10);
+			}
+		} else {
+			if (!within_low ||
+			    (within_high &&
+			     serd_bigint_plus_compare(numer, numer, denom) >= 0)) {
+				// In high only, or halfway and the next digit is > 5, round up
+				assert(buffer[length - 1] != '9');
+				buffer[length - 1]++;
+			}
+
+			break;
+		}
+	}
 
-	*s++ = '.';
+	return length;
+}
 
-	// Write fractional part (right to left)
-	double frac_part = fabs(d - int_part);
-	if (frac_part < DBL_EPSILON) {
-		*s++ = '0';
-		n_bytes = (size_t)(s - buf);
+SerdDecimalCount
+serd_decimals(const double d, char* const buf, const unsigned max_digits)
+{
+	assert(isfinite(d) && fpclassify(d) != FP_ZERO);
+
+	const SerdSoftFloat value           = soft_float_from_double(d);
+	const int           power           = (int)(log10(d));
+	const bool          is_even         = !(value.f & 1);
+	const bool          lower_is_closer = double_lower_boundary_is_closer(d);
+
+	// Calculate initial values so that v = (numer / denom) * 10^power
+	SerdBigint numer;
+	SerdBigint denom;
+	SerdBigint d_lower;
+	calculate_initial_values(
+	        value.f, value.e, power, lower_is_closer, &numer, &denom, &d_lower);
+
+	SerdBigint  d_upper_storage;
+	SerdBigint* d_upper = NULL;
+	if (lower_is_closer) {
+		// Scale upper boundary to account for the closer lower boundary
+		// (the numerator and denominator were already scaled above)
+		d_upper_storage = d_lower;
+		d_upper         = &d_upper_storage;
+		serd_bigint_shift_left(d_upper, 1);
 	} else {
-		uint64_t frac = (uint64_t)llround(frac_part * pow(10.0, (int)frac_digits));
-		s += frac_digits - 1;
-		unsigned i = 0;
+		d_upper = &d_lower; // Boundaries are the same, reuse the lower
+	}
 
-		// Skip trailing zeros
-		for (; i < frac_digits - 1 && !(frac % 10); ++i, --s, frac /= 10) {}
+	// Scale if necessary to make 1 <= (numer + delta) / denom < 10
+	SerdDecimalCount count = {0, 0};
+	if (within_upper(&numer, &denom, d_upper, is_even)) {
+		count.expt = power;
+	} else {
+		count.expt = power - 1;
+		serd_bigint_multiply_u32(&numer, 10);
+		serd_bigint_multiply_u32(&d_lower, 10);
+		if (d_upper != &d_lower) {
+			serd_bigint_multiply_u32(d_upper, 10);
+		}
+	}
 
-		n_bytes = (size_t)(s - buf) + 1u;
+	// Write digits to output
+	assert(check_initial_values(&numer, &denom, d_upper));
+	count.count = emit_digits(
+	        &numer, &denom, &d_lower, d_upper, is_even, buf, max_digits);
 
-		// Write digits from last trailing zero to decimal point
-		for (; i < frac_digits; ++i) {
-			*s-- = (char)('0' + (frac % 10));
-			frac /= 10;
-		}
+	// Trim trailing zeros
+	while (count.count > 1 && buf[count.count - 1] == '0') {
+		buf[--count.count] = 0;
 	}
 
-	return n_bytes;
+	buf[count.count] = '\0';
+	return count;
 }
diff --git a/src/decimal.h b/src/decimal.h
index eefc07ef..8575cd85 100644
--- a/src/decimal.h
+++ b/src/decimal.h
@@ -19,14 +19,26 @@
 
 #include <stdint.h>
 
+typedef struct {
+	unsigned count; ///< Number of digits
+	int      expt;  ///< Power of 10 exponent
+} SerdDecimalCount;
+
 /// Return the number of decimal digits required to represent `n`
 int
 serd_count_digits(uint64_t i);
 
-unsigned
-serd_double_int_digits(double abs);
+/**
+   Write significant decimal digits for `d` into `buf`.
+
+   Writes only significant digits, without any leading or trailing zeros.  The
+   actual number is given by the exponent in the return value.
 
-unsigned
-serd_decimals(double d, char* buf, unsigned frac_digits);
+   @param d The number to convert to decimal, must be finite and non-zero.
+   @param buf The output buffer at least `max_digits` long.
+   @param max_digits The maximum number of digits to write.
+*/
+SerdDecimalCount
+serd_decimals(double d, char* buf, unsigned max_digits);
 
 #endif // SERD_DECIMAL_H
diff --git a/src/node.c b/src/node.c
index 2bd47038..9a0ad3b9 100644
--- a/src/node.c
+++ b/src/node.c
@@ -17,6 +17,7 @@
 #include "node.h"
 
 #include "decimal.h"
+#include "int_math.h"
 #include "namespaces.h"
 #include "static_nodes.h"
 #include "string_utils.h"
@@ -30,6 +31,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+// Define C11 numeric constants if the compiler hasn't already
+#ifndef DBL_DECIMAL_DIG
+#    define DBL_DECIMAL_DIG 17
+#endif
+
 static const size_t serd_node_align = sizeof(SerdNode);
 
 static SerdNode*
@@ -566,25 +572,126 @@ serd_new_relative_uri(const char*     str,
 	return node;
 }
 
-SerdNode*
-serd_new_decimal(double d, unsigned frac_digits, const SerdNode* datatype)
+static size_t
+copy_digits(char* const dest, const char* const src, const size_t n)
+{
+	memcpy(dest, src, n);
+	return n;
+}
+
+static size_t
+set_zeros(char* const dest, const size_t n)
 {
-	if (!isfinite(d)) {
+	memset(dest, '0', n);
+	return n;
+}
+
+typedef enum { POINT_AFTER, POINT_BEFORE, POINT_BETWEEN } PointLocation;
+
+typedef struct {
+	PointLocation point_loc;      ///< Location of decimal point
+	unsigned      n_zeros_before; ///< Number of additional zeros before point
+	unsigned      n_zeros_after;  ///< Number of additional zeros after point
+	unsigned      n_digits;       ///< Number of significant digits
+} SerdDecimalMetrics;
+
+static SerdDecimalMetrics
+get_decimal_metrics(const SerdDecimalCount count)
+{
+	const int expt =
+		count.expt >= 0 ? (count.expt - (int)count.count + 1) : count.expt;
+
+	const unsigned     n_digits = count.count;
+	SerdDecimalMetrics metrics  = {POINT_AFTER, 0, 0, n_digits};
+	if (count.expt >= (int)n_digits - 1) {
+		metrics.point_loc      = POINT_AFTER;
+		metrics.n_zeros_before = (unsigned)count.expt - (count.count - 1u);
+		metrics.n_zeros_after  = 1u;
+	} else if (count.expt < 0) {
+		metrics.point_loc      = POINT_BEFORE;
+		metrics.n_zeros_before = 1u;
+		metrics.n_zeros_after = (unsigned)(-expt - 1);
+	} else {
+		metrics.point_loc = POINT_BETWEEN;
+	}
+
+	return metrics;
+}
+
+SerdNode*
+serd_new_decimal(const double    d,
+                 const unsigned  max_precision,
+                 const unsigned  max_frac_digits,
+                 const SerdNode* datatype)
+{
+	const SerdNode* const type    = datatype ? datatype : &serd_xsd_decimal.node;
+	const int             fpclass = fpclassify(d);
+
+	if (fpclass == FP_ZERO) {
+		return signbit(d) ? serd_new_typed_literal("-0.0", type)
+		                  : serd_new_typed_literal("0.0", type);
+	} else if (fpclass != FP_NORMAL && fpclass != FP_SUBNORMAL) {
 		return NULL;
 	}
 
-	const SerdNode* type       = datatype ? datatype : &serd_xsd_decimal.node;
-	const double    abs_d      = fabs(d);
-	const unsigned  int_digits = serd_double_int_digits(abs_d);
-	const size_t    len        = int_digits + frac_digits + 3;
-	const size_t    type_len   = serd_node_total_size(type);
-	const size_t    total_len  = len + type_len;
+	// Adjust precision to get the right number of fractional digits
+	unsigned precision = max_precision;
+	if (max_frac_digits) {
+		const int order              = (int)(log10(fabs(d)) + 1);
+		const int required_precision = (int)max_frac_digits + order;
 
-	SerdNode* const node =
-		serd_node_malloc(total_len, SERD_HAS_DATATYPE, SERD_LITERAL);
+		precision = (unsigned)MIN((int)max_precision,
+		                          MAX(0, required_precision));
+	}
+
+	if (precision == 0) {
+		return serd_new_typed_literal("0.0", type);
+	}
+
+	// Get decimal digits and measure
+	char                   digits[DBL_DECIMAL_DIG + 1] = {0};
+	const SerdDecimalCount count = serd_decimals(fabs(d), digits, precision);
+	SerdDecimalMetrics     m     = get_decimal_metrics(count);
+
+	// Calculate string length and allocate node
+	const unsigned           n_zeros  = m.n_zeros_before + m.n_zeros_after;
+	const size_t             len      = (d < 0) + m.n_digits + 1 + n_zeros;
+	const size_t             type_len = serd_node_total_size(type);
+	SerdNode* const          node =
+		serd_node_malloc(len + type_len, SERD_HAS_DATATYPE, SERD_LITERAL);
+
+	char* ptr = serd_node_buffer(node);
+	if (d < 0) {
+		*ptr++ = '-';
+	}
+
+	if (m.point_loc == POINT_AFTER) {
+		ptr += copy_digits(ptr, digits, m.n_digits);
+		ptr += set_zeros(ptr, m.n_zeros_before);
+		*ptr++ = '.';
+		*ptr++ = '0';
+	} else if (m.point_loc == POINT_BEFORE) {
+		*ptr++ = '0';
+		*ptr++ = '.';
+		ptr += set_zeros(ptr, m.n_zeros_after);
+		copy_digits(ptr, digits, m.n_digits);
+	} else {
+		assert(m.point_loc == POINT_BETWEEN);
+		assert(count.expt >= -1);
+
+		const size_t n_before = (size_t)count.expt + 1u;
+		const size_t n_after =
+		        (max_frac_digits ? MIN(max_frac_digits, m.n_digits - n_before)
+		                         : m.n_digits - n_before);
 
-	node->n_bytes = serd_decimals(d, serd_node_buffer(node), frac_digits);
+		ptr += copy_digits(ptr, digits, n_before);
+		*ptr++ = '.';
+		memcpy(ptr, digits + n_before, n_after);
+	}
 
+	assert(strlen(serd_node_buffer(node)) == len);
+	node->n_bytes = len;
+	assert(serd_node_get_meta(node)->type == 0);
 	memcpy(serd_node_get_meta(node), type, type_len);
 	serd_node_check_padding(node);
 	return node;
diff --git a/tests/decimal_test.c b/tests/decimal_test.c
index b0361e2c..4fb521a5 100644
--- a/tests/decimal_test.c
+++ b/tests/decimal_test.c
@@ -1,5 +1,5 @@
 /*
-  Copyright 2011-2019 David Robillard <http://drobilla.net>
+  Copyright 2011-2020 David Robillard <http://drobilla.net>
 
   Permission to use, copy, modify, and/or distribute this software for any
   purpose with or without fee is hereby granted, provided that the above
@@ -18,7 +18,13 @@
 
 #include "../src/decimal.h"
 
+#include "serd/serd.h"
+
 #include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
 
 static void
 test_count_digits(void)
@@ -48,8 +54,50 @@ test_count_digits(void)
 	assert(20 == serd_count_digits(18446744073709551615ull));
 }
 
+static void
+check_precision(const double   d,
+                const unsigned precision,
+                const unsigned frac_digits,
+                const char*    expected)
+{
+	SerdNode* const node = serd_new_decimal(d, precision, frac_digits, NULL);
+	const char*     str  = serd_node_get_string(node);
+
+	if (strcmp(str, expected)) {
+		fprintf(stderr, "error: string is \"%s\"\n", str);
+		fprintf(stderr, "note:  expected  \"%s\"\n", expected);
+		assert(false);
+	}
+
+	serd_node_free(node);
+}
+
+static void
+test_precision(void)
+{
+	assert(serd_new_decimal((double)INFINITY, 17, 0, NULL) == NULL);
+	assert(serd_new_decimal((double)-INFINITY, 17, 0, NULL) == NULL);
+	assert(serd_new_decimal((double)NAN, 17, 0, NULL) == NULL);
+
+	check_precision(1.0000000001, 17, 8, "1.0");
+	check_precision(0.0000000001, 17, 10, "0.0000000001");
+	check_precision(0.0000000001, 17, 8, "0.0");
+
+	check_precision(12345.678900, 9, 5, "12345.6789");
+	check_precision(12345.678900, 8, 5, "12345.678");
+	check_precision(12345.678900, 5, 5, "12345.0");
+	check_precision(12345.678900, 3, 5, "12300.0");
+
+	check_precision(12345.678900, 9, 0, "12345.6789");
+	check_precision(12345.678900, 9, 5, "12345.6789");
+	check_precision(12345.678900, 9, 3, "12345.678");
+	check_precision(12345.678900, 9, 1, "12345.6");
+}
+
 int
 main(void)
 {
 	test_count_digits();
+	test_precision();
+	return 0;
 }
diff --git a/tests/serd_test.c b/tests/serd_test.c
index 02527a51..64c4530b 100644
--- a/tests/serd_test.c
+++ b/tests/serd_test.c
@@ -300,13 +300,13 @@ test_double_to_node(void)
 	                                "0.01",
 	                                "2.05",
 	                                "-16.00001",
-	                                "5.00000001",
+	                                "5.0",
 	                                "0.0",
 	                                NULL,
 	                                NULL };
 
 	for (size_t i = 0; i < sizeof(dbl_test_nums) / sizeof(double); ++i) {
-		SerdNode*   node     = serd_new_decimal(dbl_test_nums[i], 8, NULL);
+		SerdNode*   node     = serd_new_decimal(dbl_test_nums[i], 17, 8, NULL);
 		const char* node_str = serd_node_get_string(node);
 		const bool  pass     = (node_str && dbl_test_strs[i])
 		                          ? !strcmp(node_str, dbl_test_strs[i])
author	David Robillard <d@drobilla.net>	2019-10-06 20:40:56 +0200
committer	David Robillard <d@drobilla.net>	2020-06-21 18:12:04 +0200
commit	a204962e35263d0f0aca8e9b34e5091669c7e057 (patch)
tree	f31fa1e5fe01a45011db5a6e88f47500cce4f06e
parent	e7cacc2a5ec010f6cc0bc12bd0bd4657c04236ee (diff)
download	serd-a204962e35263d0f0aca8e9b34e5091669c7e057.tar.gz serd-a204962e35263d0f0aca8e9b34e5091669c7e057.tar.bz2 serd-a204962e35263d0f0aca8e9b34e5091669c7e057.zip