mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
math/big: fix Float.Float64 conversion for denormal corner cases
- This change uses the same code as for Float32 and fixes the case of a number that gets rounded up to the smallest denormal. - Enabled correspoding test case. Change-Id: I8aac874a566cd727863a82717854f603fbdc26c6 Reviewed-on: https://go-review.googlesource.com/10352 Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
parent
8de8925c77
commit
5ffbca4823
2 changed files with 40 additions and 39 deletions
|
|
@ -959,10 +959,6 @@ func (x *Float) Float32() (float32, Accuracy) {
|
||||||
panic("unreachable")
|
panic("unreachable")
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(gri) Use same algorithm for Float64 as for Float32. The Float64 code
|
|
||||||
// is incorrect for some corner cases (numbers that get rounded up to smallest
|
|
||||||
// denormal - test cases are missing).
|
|
||||||
|
|
||||||
// Float64 returns the float64 value nearest to x. If x is too small to be
|
// Float64 returns the float64 value nearest to x. If x is too small to be
|
||||||
// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
|
// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
|
||||||
// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
|
// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
|
||||||
|
|
@ -987,64 +983,70 @@ func (x *Float) Float64() (float64, Accuracy) {
|
||||||
emax = bias // 1023 largest unbiased exponent (normal)
|
emax = bias // 1023 largest unbiased exponent (normal)
|
||||||
)
|
)
|
||||||
|
|
||||||
// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
|
// Float mantissa m is 0.5 <= m < 1.0; compute exponent for floatxx mantissa.
|
||||||
// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
|
e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
|
||||||
// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
|
p := mbits + 1 // precision of normal float
|
||||||
// corresponding Float exponent.
|
|
||||||
// (see also implementation of math.Ldexp for similar code)
|
|
||||||
|
|
||||||
if x.exp < dmin+1 {
|
// If the exponent is too small, we may have a denormal number
|
||||||
// underflow
|
// in which case we have fewer mantissa bits available: reduce
|
||||||
if x.neg {
|
// precision accordingly.
|
||||||
var z float64
|
if e < emin {
|
||||||
return -z, Above
|
p -= emin - int(e)
|
||||||
|
// Make sure we have at least 1 bit so that we don't
|
||||||
|
// lose numbers rounded up to the smallest denormal.
|
||||||
|
if p < 1 {
|
||||||
|
p = 1
|
||||||
}
|
}
|
||||||
return 0.0, Below
|
|
||||||
}
|
}
|
||||||
// x.exp >= dmin+1
|
|
||||||
|
|
||||||
|
// round
|
||||||
var r Float
|
var r Float
|
||||||
r.prec = mbits + 1 // +1 for implicit msb
|
r.prec = uint32(p)
|
||||||
if x.exp < emin+1 {
|
|
||||||
// denormal number - round to fewer bits
|
|
||||||
r.prec = uint32(x.exp - dmin)
|
|
||||||
}
|
|
||||||
r.Set(x)
|
r.Set(x)
|
||||||
|
e = r.exp - 1
|
||||||
|
|
||||||
// Rounding may have caused r to overflow to ±Inf
|
// Rounding may have caused r to overflow to ±Inf
|
||||||
// (rounding never causes underflows to 0).
|
// (rounding never causes underflows to 0).
|
||||||
if r.form == inf {
|
if r.form == inf {
|
||||||
r.exp = emax + 2 // cause overflow below
|
e = emax + 1 // cause overflow below
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.exp > emax+1 {
|
// If the exponent is too large, overflow to ±Inf.
|
||||||
|
if e > emax {
|
||||||
// overflow
|
// overflow
|
||||||
if x.neg {
|
if x.neg {
|
||||||
return math.Inf(-1), Below
|
return math.Inf(-1), Below
|
||||||
}
|
}
|
||||||
return math.Inf(+1), Above
|
return math.Inf(+1), Above
|
||||||
}
|
}
|
||||||
// dmin+1 <= r.exp <= emax+1
|
|
||||||
|
|
||||||
var s uint64
|
// Determine sign, biased exponent, and mantissa.
|
||||||
if r.neg {
|
var sign, bexp, mant uint64
|
||||||
s = 1 << (fbits - 1)
|
if x.neg {
|
||||||
|
sign = 1 << (fbits - 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
|
|
||||||
|
|
||||||
// Rounding may have caused a denormal number to
|
// Rounding may have caused a denormal number to
|
||||||
// become normal. Check again.
|
// become normal. Check again.
|
||||||
c := 1.0
|
if e < emin {
|
||||||
if r.exp < emin+1 {
|
|
||||||
// denormal number
|
// denormal number
|
||||||
r.exp += mbits
|
if e < dmin {
|
||||||
c = 1.0 / (1 << mbits) // 2**-mbits
|
// underflow to ±0
|
||||||
|
if x.neg {
|
||||||
|
var z float64
|
||||||
|
return -z, Above
|
||||||
|
}
|
||||||
|
return 0.0, Below
|
||||||
|
}
|
||||||
|
// bexp = 0
|
||||||
|
mant = high64(r.mant) >> (fbits - r.prec)
|
||||||
|
} else {
|
||||||
|
// normal number: emin <= e <= emax
|
||||||
|
bexp = uint64(e+bias) << mbits
|
||||||
|
mant = high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
|
||||||
}
|
}
|
||||||
// emin+1 <= r.exp <= emax+1
|
|
||||||
e := uint64(r.exp-emin) << mbits
|
|
||||||
|
|
||||||
return c * math.Float64frombits(s|e|m), r.acc
|
return math.Float64frombits(sign | bexp | mant), r.acc
|
||||||
|
|
||||||
case zero:
|
case zero:
|
||||||
if x.neg {
|
if x.neg {
|
||||||
|
|
|
||||||
|
|
@ -922,8 +922,7 @@ func TestFloatFloat64(t *testing.T) {
|
||||||
{"0x0.00000000000008p-1022", 0, Below},
|
{"0x0.00000000000008p-1022", 0, Below},
|
||||||
|
|
||||||
// denormals
|
// denormals
|
||||||
// TODO(gri) enable once Float64 is fixed
|
{"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
|
||||||
// {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
|
|
||||||
{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
|
{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
|
||||||
{"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
|
{"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
|
||||||
{"1p-1074", math.SmallestNonzeroFloat64, Exact},
|
{"1p-1074", math.SmallestNonzeroFloat64, Exact},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue