mirror of
				https://github.com/python/cpython.git
				synced 2025-10-26 03:04:41 +00:00 
			
		
		
		
	Add more comments to hypot() (GH-102817)
This commit is contained in:
		
							parent
							
								
									1cb75a9ce0
								
							
						
					
					
						commit
						3adb23a17d
					
				
					 1 changed files with 12 additions and 18 deletions
				
			
		|  | @ -2447,9 +2447,8 @@ Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum. | ||||||
| To minimize loss of information during the accumulation of fractional | To minimize loss of information during the accumulation of fractional | ||||||
| values, each term has a separate accumulator.  This also breaks up | values, each term has a separate accumulator.  This also breaks up | ||||||
| sequential dependencies in the inner loop so the CPU can maximize | sequential dependencies in the inner loop so the CPU can maximize | ||||||
| floating point throughput. [4]  On a 2.6 GHz Haswell, adding one | floating point throughput. [4]  On an Apple M1 Max, hypot(*vec) | ||||||
| dimension has an incremental cost of only 5ns -- for example when | takes only 3.33 µsec when len(vec) == 1000. | ||||||
| moving from hypot(x,y) to hypot(x,y,z). |  | ||||||
| 
 | 
 | ||||||
| The square root differential correction is needed because a | The square root differential correction is needed because a | ||||||
| correctly rounded square root of a correctly rounded sum of | correctly rounded square root of a correctly rounded sum of | ||||||
|  | @ -2473,7 +2472,7 @@ step is exact.  The Neumaier summation computes as if in doubled | ||||||
| precision (106 bits) and has the advantage that its input squares | precision (106 bits) and has the advantage that its input squares | ||||||
| are non-negative so that the condition number of the sum is one. | are non-negative so that the condition number of the sum is one. | ||||||
| The square root with a differential correction is likewise computed | The square root with a differential correction is likewise computed | ||||||
| as if in double precision. | as if in doubled precision. | ||||||
| 
 | 
 | ||||||
| For n <= 1000, prior to the final addition that rounds the overall | For n <= 1000, prior to the final addition that rounds the overall | ||||||
| result, the internal accuracy of "h" together with its correction of | result, the internal accuracy of "h" together with its correction of | ||||||
|  | @ -2514,12 +2513,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) | ||||||
|     } |     } | ||||||
|     frexp(max, &max_e); |     frexp(max, &max_e); | ||||||
|     if (max_e < -1023) { |     if (max_e < -1023) { | ||||||
|         /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
 |         /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. */ | ||||||
|            So we first perform lossless scaling from subnormals back to normals, |  | ||||||
|            then recurse back to vector_norm(), and then finally undo the scaling. |  | ||||||
|         */ |  | ||||||
|         for (i=0 ; i < n ; i++) { |         for (i=0 ; i < n ; i++) { | ||||||
|             vec[i] /= DBL_MIN; |             vec[i] /= DBL_MIN;          // convert subnormals to normals
 | ||||||
|         } |         } | ||||||
|         return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan); |         return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan); | ||||||
|     } |     } | ||||||
|  | @ -2529,17 +2525,14 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) | ||||||
|     for (i=0 ; i < n ; i++) { |     for (i=0 ; i < n ; i++) { | ||||||
|         x = vec[i]; |         x = vec[i]; | ||||||
|         assert(Py_IS_FINITE(x) && fabs(x) <= max); |         assert(Py_IS_FINITE(x) && fabs(x) <= max); | ||||||
| 
 |         x *= scale;                     // lossless scaling
 | ||||||
|         x *= scale; |  | ||||||
|         assert(fabs(x) < 1.0); |         assert(fabs(x) < 1.0); | ||||||
| 
 |         pr = dl_mul(x, x);              // lossless squaring
 | ||||||
|         pr = dl_mul(x, x); |  | ||||||
|         assert(pr.hi <= 1.0); |         assert(pr.hi <= 1.0); | ||||||
| 
 |         sm = dl_fast_sum(csum, pr.hi);  // lossless addition
 | ||||||
|         sm = dl_fast_sum(csum, pr.hi); |  | ||||||
|         csum = sm.hi; |         csum = sm.hi; | ||||||
|         frac1 += pr.lo; |         frac1 += pr.lo;                 // lossy addition
 | ||||||
|         frac2 += sm.lo; |         frac2 += sm.lo;                 // lossy addition
 | ||||||
|     } |     } | ||||||
|     h = sqrt(csum - 1.0 + (frac1 + frac2)); |     h = sqrt(csum - 1.0 + (frac1 + frac2)); | ||||||
|     pr = dl_mul(-h, h); |     pr = dl_mul(-h, h); | ||||||
|  | @ -2548,7 +2541,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) | ||||||
|     frac1 += pr.lo; |     frac1 += pr.lo; | ||||||
|     frac2 += sm.lo; |     frac2 += sm.lo; | ||||||
|     x = csum - 1.0 + (frac1 + frac2); |     x = csum - 1.0 + (frac1 + frac2); | ||||||
|     return (h + x / (2.0 * h)) / scale; |     h +=  x / (2.0 * h);                 // differential correction
 | ||||||
|  |     return h / scale; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define NUM_STACK_ELEMS 16 | #define NUM_STACK_ELEMS 16 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Raymond Hettinger
						Raymond Hettinger