image/jpeg: replace fdct.go and idct.go with new implementation in dct.go

The fdct.go and idct.go files were derived from the MPEG-SSG and JPEG-IJG reference code and therefore carry licenses specific to those groups. Various license checkers flag these files as potentially problematic. The code is also not terribly well documented. This CL fixes the license problem by adding a new, from-scratch implementation using a different algorithm. As a bonus, the new code is both faster and more accurate than the old encumbered code. On speed, the new code is up to 20% faster; benchmarks below. On accuracy, in the set of blocks used in the test, we can measure the number of output values that are off-by-one from the exact rounded answer. The old FDCT was off in 8.6% of values; the new one is off in 2.5%. The old IDCT was off in 1.4% of values; the new one is off in 1.2%. goos: darwin goarch: arm64 pkg: image/jpeg cpu: Apple M3 Pro │ old │ new │ │ sec/op │ sec/op vs base │ FDCT-12 619.6n ± 3% 586.5n ± 1% -5.34% (p=0.000 n=10) IDCT-12 752.4n ± 4% 628.0n ± 1% -16.54% (p=0.000 n=10) goos: linux goarch: amd64 cpu: Intel(R) Xeon(R) CPU @ 2.30GHz │ old │ new │ │ sec/op │ sec/op vs base │ FDCT-16 1.817µ ± 0% 1.542µ ± 0% -15.11% (p=0.000 n=10) IDCT-16 1.897µ ± 0% 1.514µ ± 0% -20.22% (p=0.000 n=10) goos: linux goarch: arm64 cpu: whatever gotip-linux-arm64 has │ old │ new │ │ sec/op │ sec/op vs base │ FDCT-8 1.844µ ± 0% 1.847µ ± 0% +0.14% (p=0.000 n=10) IDCT-8 2.127µ ± 0% 1.973µ ± 0% -7.26% (p=0.000 n=10) Change-Id: Ie6d14103c8478ba5a779f234da84f345828eb925 Reviewed-on: https://go-review.googlesource.com/c/go/+/705518 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Alan Donovan <adonovan@google.com> Auto-Submit: Russ Cox <rsc@golang.org> Reviewed-by: Nigel Tao <nigeltao@google.com>
2025-12-08 06:10:04 +00:00 · 2025-09-18 12:27:06 -04:00 · 2025-09-18 12:27:06 -04:00 · fbba930271
commit fbba930271
parent 92e093467f
5 changed files with 538 additions and 403 deletions
--- a/src/image/decode_example_test.go
+++ b/src/image/decode_example_test.go
@ -70,22 +70,22 @@ func Example() {
 	}
 	// Output:
 	// bin               red  green   blue  alpha
-	// 0x0000-0x0fff:    364    790   7242      0
+	// 0x0000-0x0fff:    362    793   7245      0
-	// 0x1000-0x1fff:    645   2967   1039      0
+	// 0x1000-0x1fff:    648   2963   1036      0
-	// 0x2000-0x2fff:   1072   2299    979      0
+	// 0x2000-0x2fff:   1072   2301    977      0
-	// 0x3000-0x3fff:    820   2266    980      0
+	// 0x3000-0x3fff:    819   2266    982      0
-	// 0x4000-0x4fff:    537   1305    541      0
+	// 0x4000-0x4fff:    537   1303    541      0
-	// 0x5000-0x5fff:    319    962    261      0
+	// 0x5000-0x5fff:    321    964    261      0
-	// 0x6000-0x6fff:    322    375    177      0
+	// 0x6000-0x6fff:    321    375    177      0
-	// 0x7000-0x7fff:    601    279    214      0
+	// 0x7000-0x7fff:    599    278    213      0
-	// 0x8000-0x8fff:   3478    227    273      0
+	// 0x8000-0x8fff:   3478    228    275      0
-	// 0x9000-0x9fff:   2260    234    329      0
+	// 0x9000-0x9fff:   2260    233    328      0
-	// 0xa000-0xafff:    921    282    373      0
+	// 0xa000-0xafff:    921    282    374      0
-	// 0xb000-0xbfff:    321    335    397      0
+	// 0xb000-0xbfff:    322    335    395      0
-	// 0xc000-0xcfff:    229    388    298      0
+	// 0xc000-0xcfff:    228    388    299      0
-	// 0xd000-0xdfff:    260    414    277      0
+	// 0xd000-0xdfff:    261    415    277      0
-	// 0xe000-0xefff:    516    428    298      0
+	// 0xe000-0xefff:    516    423    297      0
-	// 0xf000-0xffff:   2785   1899   1772  15450
+	// 0xf000-0xffff:   2785   1903   1773  15450
 }
 const data = `
--- a/src/image/jpeg/dct.go
+++ b/src/image/jpeg/dct.go
@ -0,0 +1,521 @@
 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package jpeg
 // Discrete Cosine Transformation (DCT) implementations using the algorithm from
 // Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
 // “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
 // https://ieeexplore.ieee.org/document/266596
 //
 // Since the paper is paywalled, the rest of this comment gives a summary.
 //
 // A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
 // and transforms them in place into the result values.
 //
 // The mathematical definition of the N-point 1D FDCT is:
 //
 //	X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
 //
 // where α₀ = √2 and α_k = 1 for k > 0.
 //
 // For our purposes, N=8, so the angles end up being multiples of π/16.
 // The most direct implementation of this definition would require 64 multiplications.
 //
 // Loeffler's paper presents a more efficient computation that requires only
 // 11 multiplications and works in terms of three basic operations:
 //
 //  - A “butterfly” x0, x1 = x0+x1, x0-x1.
 //    The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
 //
 //  - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
 //
 //  - A rotation of x0, x1 by θ, defined as:
 //    x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
 //    The inverse is rotation by -θ.
 //
 // The algorithm proceeds in four stages:
 //
 // Stage 1:
 //  - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
 //
 // Stage 2:
 //  - butterfly x0, x3; x1, x2
 //  - rotate x4, x7 by 3π/16
 //  - rotate x5, x6 by π/16.
 //
 // Stage 3:
 //  - butterfly x0, x1; x4, x6; x7, x5
 //  - rotate x2, x3 by 6π/16 and scale by √2.
 //
 // Stage 4:
 //  - butterfly x7, x4
 //  - scale x5, x6 by √2.
 //
 // Finally, the values are permuted. The permutation can be read as either:
 //  - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
 //  - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
 // The code below uses the second form to make it easier to merge adjacent stores.
 // (Note that unlike in recursive FFT implementations, the permutation here is
 // not always mapping indexes to their bit reversals.)
 //
 // As written above, the rotation requires four multiplications, but it can be
 // reduced to three by refactoring (see [dctBox] below), and the scaling in
 // stage 3 can be merged into the rotation constants, so the overall cost
 // of a 1D FDCT is 11 multiplies.
 //
 // The 1D inverse DCT (IDCT) is the 1D FDCT run backward
 // with all the basic operations inverted.
 // dctBox implements a 3-multiply, 3-add rotation+scaling.
 // Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
 // rotated and scaled coordinates.
 // (It is called dctBox because the rotate+scale operation
 // is drawn as a box in Figures 1 and 2 in the paper.)
 func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
 	// y0 = x0*kcos + x1*ksin
 	// y1 = -x0*ksin + x1*kcos
 	ksum := kcos * (x0 + x1)
 	y0 = ksum + (ksin-kcos)*x1
 	y1 = ksum - (kcos+ksin)*x0
 	return y0, y1
 }
 // A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
 // The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
 // but it is convenient to use int32s for intermediate storage,
 // so we define only a single block type of [8*8]int32.
 //
 // A 2D DCT is implemented as 1D DCTs over the rows and columns.
 //
 // dct_test.go defines a String method for nice printing in tests.
 type block [blockSize]int32
 const blockSize = 8 * 8
 // Note on Numerical Precision
 //
 // The inputs to both the FDCT and IDCT are uint8 values stored in a block,
 // and the outputs are int16s in the same block, but the overall operation
 // uses int32 values as fixed-point intermediate values.
 // In the code comments below, the notation “QN.M” refers to a
 // signed value of 1+N+M significant bits, one of which is the sign bit,
 // and M of which hold fractional (sub-integer) precision.
 // For example, 255 as a Q8.0 value is stored as int32(255),
 // while 255 as a Q8.1 value is stored as int32(510),
 // and 255.5 as a Q8.1 value is int32(511).
 // The notation UQN.M refers to an unsigned value of N+M significant bits.
 // See https://en.wikipedia.org/wiki/Q_(number_format) for more.
 //
 // In general we only need to keep about 16 significant bits, but it is more
 // efficient and somewhat more precise to let unnecessary fractional bits
 // accumulate and shift them away in bulk rather than after every operation.
 // As such, it is important to keep track of the number of fractional bits
 // in each variable at different points in the code, to avoid mistakes like
 // adding numbers with different fractional precisions, as well as to keep
 // track of the total number of bits, to avoid overflow. A comment like:
 //
 //	// x[123] now Q8.2.
 //
 // means that x1, x2, and x3 are all Q8.2 (11-bit) values.
 // Keeping extra precision bits also reduces the size of the errors introduced
 // by using right shift to approximate rounded division.
 // Constants needed for the implementation.
 // These are all 60-bit precision fixed-point constants.
 // The function c(val, b) rounds the constant to b bits.
 // c is simple enough that calls to it with constant args
 // are inlined and constant-propagated down to an inline constant.
 // Each constant is commented with its Ivy definition (see robpike.io/ivy),
 // using this scaling helper function:
 //
 //	op fix x = floor 0.5 + x * 2**60
 const (
 	cos1          = 1130768441178740757 // fix cos 1*pi/16
 	sin1          = 224923827593068887  // fix sin 1*pi/16
 	cos3          = 958619196450722178  // fix cos 3*pi/16
 	sin3          = 640528868967736374  // fix sin 3*pi/16
 	sqrt2         = 1630477228166597777 // fix sqrt 2
 	sqrt2_cos6    = 623956622067911264  // fix (sqrt 2)*cos 6*pi/16
 	sqrt2_sin6    = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
 	sqrt2inv      = 815238614083298888  // fix 1/sqrt 2
 	sqrt2inv_cos6 = 311978311033955632  // fix (1/sqrt 2)*cos 6*pi/16
 	sqrt2inv_sin6 = 753182269664427492  // fix (1/sqrt 2)*sin 6*pi/16
 )
 func c(x uint64, bits int) int32 {
 	return int32((x + (1 << (59 - bits))) >> (60 - bits))
 }
 // fdct implements the forward DCT.
 // Inputs are UQ8.0; outputs are Q13.0.
 func fdct(b *block) {
 	fdctCols(b)
 	fdctRows(b)
 }
 // fdctCols applies the 1D DCT to the columns of b.
 // Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
 // Outputs are Q10.18.
 func fdctCols(b *block) {
 	for i := range 8 {
 		x0 := b[0*8+i]
 		x1 := b[1*8+i]
 		x2 := b[2*8+i]
 		x3 := b[3*8+i]
 		x4 := b[4*8+i]
 		x5 := b[5*8+i]
 		x6 := b[6*8+i]
 		x7 := b[7*8+i]
 		// x[01234567] are UQ8.0 in [0,255].
 		// Stage 1: four butterflies.
 		// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
 		// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.
 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[0123] now UQ9.0 in [0, 510].
 		// x[4567] now Q8.0 in [-255,255].
 		// Stage 2: two boxes and two butterflies.
 		// A box on QN.M inputs with B-bit constants
 		// produces Q(N+1).(M+B) outputs.
 		// (The +1 is from the addition.)
 		x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
 		x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
 		// x[47] now Q9.18 in [-354, 354].
 		// x[56] now Q9.18 in [-300, 300].
 		x0, x3 = x0+x3, x0-x3
 		x1, x2 = x1+x2, x1-x2
 		// x[01] now UQ10.0 in [0, 1020].
 		// x[23] now Q9.0 in [-510, 510].
 		// Stage 3: one box and three butterflies.
 		x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
 		// x[23] now Q10.18 in [-943, 943].
 		x0, x1 = x0+x1, x0-x1
 		// x0 now UQ11.0 in [0, 2040].
 		// x1 now Q10.0 in [-1020, 1020].
 		// Store x0, x1, x2, x3 to their permuted targets.
 		// The original +128 in every input value
 		// has cancelled out except in the “DC signal” x0.
 		// Subtracting 128*8 here is equivalent to subtracting 128
 		// from every input before we started, but cheaper.
 		// It also converts x0 from UQ11.18 to Q10.18.
 		b[0*8+i] = (x0 - 128*8) << 18
 		b[4*8+i] = x1 << 18
 		b[2*8+i] = x2
 		b[6*8+i] = x3
 		x4, x6 = x4+x6, x4-x6
 		x7, x5 = x7+x5, x7-x5
 		// x[4567] now Q10.18 in [-654, 654].
 		// Stage 4: two √2 scalings and one butterfly.
 		x5 = (x5 >> 12) * c(sqrt2, 12)
 		x6 = (x6 >> 12) * c(sqrt2, 12)
 		// x[56] still Q10.18 in [-925, 925] (= 654√2).
 		x7, x4 = x7+x4, x7-x4
 		// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
 		// This is not obvious at all! See “Note on 925” below.
 		// Store x4 x5 x6 x7 to their permuted targets.
 		b[1*8+i] = x7
 		b[3*8+i] = x5
 		b[5*8+i] = x6
 		b[7*8+i] = x4
 	}
 }
 // fdctRows applies the 1D DCT to the rows of b.
 // Inputs are Q10.18; outputs are Q13.0.
 func fdctRows(b *block) {
 	for i := range 8 {
 		x := b[8*i : 8*i+8 : 8*i+8]
 		x0 := x[0]
 		x1 := x[1]
 		x2 := x[2]
 		x3 := x[3]
 		x4 := x[4]
 		x5 := x[5]
 		x6 := x[6]
 		x7 := x[7]
 		// x[01234567] are Q10.18 [-1020, 1020].
 		// Stage 1: four butterflies.
 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q11.18 in [-2040, 2040].
 		// Stage 2: two boxes and two butterflies.
 		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
 		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
 		// x[47] now Q12.18 in [-2830, 2830].
 		// x[56] now Q12.18 in [-2400, 2400].
 		x0, x3 = x0+x3, x0-x3
 		x1, x2 = x1+x2, x1-x2
 		// x[01234567] now Q12.18 in [-4080, 4080].
 		// Stage 3: one box and three butterflies.
 		x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
 		// x[23] now Q13.18 in [-7539, 7539].
 		x0, x1 = x0+x1, x0-x1
 		// x[01] now Q13.18 in [-8160, 8160].
 		x4, x6 = x4+x6, x4-x6
 		x7, x5 = x7+x5, x7-x5
 		// x[4567] now Q13.18 in [-5230, 5230].
 		// Stage 4: two √2 scalings and one butterfly.
 		x5 = (x5 >> 14) * c(sqrt2, 14)
 		x6 = (x6 >> 14) * c(sqrt2, 14)
 		// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
 		x7, x4 = x7+x4, x7-x4
 		// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
 		// See “Note on 925” below.
 		// Cut from Q13.18 to Q13.0.
 		x0 = (x0 + 1<<17) >> 18
 		x1 = (x1 + 1<<17) >> 18
 		x2 = (x2 + 1<<17) >> 18
 		x3 = (x3 + 1<<17) >> 18
 		x4 = (x4 + 1<<17) >> 18
 		x5 = (x5 + 1<<17) >> 18
 		x6 = (x6 + 1<<17) >> 18
 		x7 = (x7 + 1<<17) >> 18
 		// Note: Unlike in fdctCols, saved all stores for the end
 		// because they are adjacent memory locations and some systems
 		// can use multiword stores.
 		x[0] = x0
 		x[1] = x7
 		x[2] = x2
 		x[3] = x5
 		x[4] = x1
 		x[5] = x6
 		x[6] = x3
 		x[7] = x4
 	}
 }
 // “Note on 925”, deferred from above to avoid interrupting code.
 //
 // In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
 // Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
 //
 // Stage 2:
 //	x4 = b4*cos3 + b7*sin3
 //	x7 = -b4*sin3 + b7*cos3
 //	x5 = b5*cos1 + b6*sin1
 //	x6 = -b5*sin1 + b6*cos1
 //
 // Stage 3:
 //
 //	x4 = x4+x6 =  b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
 //	x6 = x4-x6 =  b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
 //	x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
 //	x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
 //
 // Stage 4:
 //
 //	x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
 //	   = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
 //	   < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
 //
 //	x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
 //	   = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
 //	   < same 925.
 //
 // The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
 // the same kinds of numbers for all four, just with different paths to them.
 //
 // In fdctRows, the same analysis applies, but the initial values are
 // in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.
 // idct implements the inverse DCT.
 // Inputs are UQ8.0; outputs are Q10.3.
 func idct(b *block) {
 	// A 2D IDCT is a 1D IDCT on rows followed by columns.
 	idctRows(b)
 	idctCols(b)
 }
 // idctRows applies the 1D IDCT to the rows of b.
 // Inputs are UQ8.0; outputs are Q9.20.
 func idctRows(b *block) {
 	for i := range 8 {
 		x := b[8*i : 8*i+8 : 8*i+8]
 		x0 := x[0]
 		x7 := x[1]
 		x2 := x[2]
 		x5 := x[3]
 		x1 := x[4]
 		x6 := x[5]
 		x3 := x[6]
 		x4 := x[7]
 		// Run FDCT backward.
 		// Independent operations have been reordered somewhat
 		// to make precision tracking easier.
 		//
 		// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
 		// and carries with it an implicit divide by two: the extra bit
 		// is added to the precision, not the value size.
 		// x[01234567] are UQ8.0 in [0, 255].
 		// Stages 4, 3, 2: x0, x1, x2, x3.
 		x0 <<= 17
 		x1 <<= 17
 		// x0, x1 now UQ8.17.
 		x0, x1 = x0+x1, x0-x1
 		// x0 now UQ8.18 in [0, 255].
 		// x1 now Q7.18 in [-127½, 127½].
 		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
 		x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
 		// x[23] now Q8.18 in [-236, 236].
 		x1, x2 = x1+x2, x1-x2
 		x0, x3 = x0+x3, x0-x3
 		// x[0123] now Q8.19 in [-246, 246].
 		// Stages 4, 3, 2: x4, x5, x6, x7.
 		x4 <<= 7
 		x7 <<= 7
 		// x[47] now UQ8.7
 		x7, x4 = x7+x4, x7-x4
 		// x7 now UQ8.8 in [0, 255].
 		// x4 now Q7.8 in [-127½, 127½].
 		x6 = x6 * c(sqrt2inv, 8)
 		x5 = x5 * c(sqrt2inv, 8)
 		// x[56] now UQ8.8 in [0, 181].
 		// Note that 1/√2 has five 0s in its binary representation after
 		// the 8th bit, so this multipliy is actually producing 12 bits of precision.
 		x7, x5 = x7+x5, x7-x5
 		x4, x6 = x4+x6, x4-x6
 		// x[4567] now Q8.9 in [-218, 218].
 		x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
 		x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
 		// x[4567] now Q9.19 in [-303, 303].
 		// Stage 1.
 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q9.20 in [-275, 275].
 		// Note: we don't need all 20 bits of “precision”,
 		// but it is faster to let idctCols shift it away as part
 		// of other operations rather than downshift here.
 		x[0] = x0
 		x[1] = x1
 		x[2] = x2
 		x[3] = x3
 		x[4] = x4
 		x[5] = x5
 		x[6] = x6
 		x[7] = x7
 	}
 }
 // idctCols applies the 1D IDCT to the columns of b.
 // Inputs are Q9.20.
 // Outputs are Q10.3. That is, the result is the IDCT*8.
 func idctCols(b *block) {
 	for i := range 8 {
 		x0 := b[0*8+i]
 		x7 := b[1*8+i]
 		x2 := b[2*8+i]
 		x5 := b[3*8+i]
 		x1 := b[4*8+i]
 		x6 := b[5*8+i]
 		x3 := b[6*8+i]
 		x4 := b[7*8+i]
 		// x[012345678] are Q9.20.
 		// Start by adding 0.5 to x0 (the incoming DC signal).
 		// The butterflies will add it to all the other values,
 		// and then the final shifts will round properly.
 		x0 += 1 << 19
 		// Stages 4, 3, 2: x0, x1, x2, x3.
 		x0, x1 = (x0+x1)>>2, (x0-x1)>>2
 		// x[01] now Q9.19.
 		// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
 		x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
 		// x[0123] now Q9.19.
 		x1, x2 = x1+x2, x1-x2
 		x0, x3 = x0+x3, x0-x3
 		// x[0123] now Q9.20.
 		// Stages 4, 3, 2: x4, x5, x6, x7.
 		x7, x4 = x7+x4, x7-x4
 		// x[47] now Q9.21.
 		x5 = (x5 >> 13) * c(sqrt2inv, 14)
 		x6 = (x6 >> 13) * c(sqrt2inv, 14)
 		// x[56] now Q9.21.
 		x7, x5 = x7+x5, x7-x5
 		x4, x6 = x4+x6, x4-x6
 		// x[4567] now Q9.22.
 		x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
 		x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
 		// x[4567] now Q10.20.
 		x0, x7 = x0+x7, x0-x7
 		x1, x6 = x1+x6, x1-x6
 		x2, x5 = x2+x5, x2-x5
 		x3, x4 = x3+x4, x3-x4
 		// x[01234567] now Q10.21.
 		x0 >>= 18
 		x1 >>= 18
 		x2 >>= 18
 		x3 >>= 18
 		x4 >>= 18
 		x5 >>= 18
 		x6 >>= 18
 		x7 >>= 18
 		// x[01234567] now Q10.3.
 		b[0*8+i] = x0
 		b[1*8+i] = x1
 		b[2*8+i] = x2
 		b[3*8+i] = x3
 		b[4*8+i] = x4
 		b[5*8+i] = x5
 		b[6*8+i] = x6
 		b[7*8+i] = x7
 	}
 }
--- a/src/image/jpeg/dct_test.go
+++ b/src/image/jpeg/dct_test.go
@ -92,7 +92,7 @@ func TestDCT(t *testing.T) {
 	}
 	// Check that the optimized and slow FDCT implementations agree.
-	testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 16)
+	testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 8)
 	testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8)
 }
--- a/src/image/jpeg/fdct.go
+++ b/src/image/jpeg/fdct.go
@ -1,192 +0,0 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package jpeg
 // This file implements a Forward Discrete Cosine Transformation.
 /*
 It is based on the code in jfdctint.c from the Independent JPEG Group,
 found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz.
 The "LEGAL ISSUES" section of the README in that archive says:
 In plain English:
 1. We don't promise that this software works.  (But if you find any bugs,
   please let us know!)
 2. You can use this software for whatever you want.  You don't have to pay us.
 3. You may not pretend that you wrote this software.  If you use it in a
   program, you must acknowledge somewhere in your documentation that
   you've used the IJG code.
 In legalese:
 The authors make NO WARRANTY or representation, either express or implied,
 with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 Permission is hereby granted to use, copy, modify, and distribute this
 software (or portions thereof) for any purpose, without fee, subject to these
 conditions:
 (1) If any part of the source code for this software is distributed, then this
 README file must be included, with this copyright and no-warranty notice
 unaltered; and any additions, deletions, or changes to the original files
 must be clearly indicated in accompanying documentation.
 (2) If only executable code is distributed, then the accompanying
 documentation must state that "this software is based in part on the work of
 the Independent JPEG Group".
 (3) Permission for use of this software is granted only if the user accepts
 full responsibility for any undesirable consequences; the authors accept
 NO LIABILITY for damages of any kind.
 These conditions apply to any software derived from or based on the IJG code,
 not just to the unmodified library.  If you use our work, you ought to
 acknowledge us.
 Permission is NOT granted for the use of any IJG author's name or company name
 in advertising or publicity relating to this software or products derived from
 it.  This software may be referred to only as "the Independent JPEG Group's
 software".
 We specifically permit and encourage the use of this software as the basis of
 commercial products, provided that all warranty or liability claims are
 assumed by the product vendor.
 */
 // Trigonometric constants in 13-bit fixed point format.
 const (
 	fix_0_298631336 = 2446
 	fix_0_390180644 = 3196
 	fix_0_541196100 = 4433
 	fix_0_765366865 = 6270
 	fix_0_899976223 = 7373
 	fix_1_175875602 = 9633
 	fix_1_501321110 = 12299
 	fix_1_847759065 = 15137
 	fix_1_961570560 = 16069
 	fix_2_053119869 = 16819
 	fix_2_562915447 = 20995
 	fix_3_072711026 = 25172
 )
 const (
 	constBits     = 13
 	pass1Bits     = 2
 	centerJSample = 128
 )
 // fdct performs a forward DCT on an 8x8 block of coefficients, including a
 // level shift.
 func fdct(b *block) {
 	// Pass 1: process rows.
 	for y := 0; y < 8; y++ {
 		y8 := y * 8
 		s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
 		x0 := s[0]
 		x1 := s[1]
 		x2 := s[2]
 		x3 := s[3]
 		x4 := s[4]
 		x5 := s[5]
 		x6 := s[6]
 		x7 := s[7]
 		tmp0 := x0 + x7
 		tmp1 := x1 + x6
 		tmp2 := x2 + x5
 		tmp3 := x3 + x4
 		tmp10 := tmp0 + tmp3
 		tmp12 := tmp0 - tmp3
 		tmp11 := tmp1 + tmp2
 		tmp13 := tmp1 - tmp2
 		tmp0 = x0 - x7
 		tmp1 = x1 - x6
 		tmp2 = x2 - x5
 		tmp3 = x3 - x4
 		s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
 		s[4] = (tmp10 - tmp11) << pass1Bits
 		z1 := (tmp12 + tmp13) * fix_0_541196100
 		z1 += 1 << (constBits - pass1Bits - 1)
 		s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
 		s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
 		tmp10 = tmp0 + tmp3
 		tmp11 = tmp1 + tmp2
 		tmp12 = tmp0 + tmp2
 		tmp13 = tmp1 + tmp3
 		z1 = (tmp12 + tmp13) * fix_1_175875602
 		z1 += 1 << (constBits - pass1Bits - 1)
 		tmp0 *= fix_1_501321110
 		tmp1 *= fix_3_072711026
 		tmp2 *= fix_2_053119869
 		tmp3 *= fix_0_298631336
 		tmp10 *= -fix_0_899976223
 		tmp11 *= -fix_2_562915447
 		tmp12 *= -fix_0_390180644
 		tmp13 *= -fix_1_961570560
 		tmp12 += z1
 		tmp13 += z1
 		s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
 		s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
 		s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
 		s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
 	}
 	// Pass 2: process columns.
 	// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
 	for x := 0; x < 8; x++ {
 		tmp0 := b[0*8+x] + b[7*8+x]
 		tmp1 := b[1*8+x] + b[6*8+x]
 		tmp2 := b[2*8+x] + b[5*8+x]
 		tmp3 := b[3*8+x] + b[4*8+x]
 		tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1)
 		tmp12 := tmp0 - tmp3
 		tmp11 := tmp1 + tmp2
 		tmp13 := tmp1 - tmp2
 		tmp0 = b[0*8+x] - b[7*8+x]
 		tmp1 = b[1*8+x] - b[6*8+x]
 		tmp2 = b[2*8+x] - b[5*8+x]
 		tmp3 = b[3*8+x] - b[4*8+x]
 		b[0*8+x] = (tmp10 + tmp11) >> pass1Bits
 		b[4*8+x] = (tmp10 - tmp11) >> pass1Bits
 		z1 := (tmp12 + tmp13) * fix_0_541196100
 		z1 += 1 << (constBits + pass1Bits - 1)
 		b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits)
 		b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits)
 		tmp10 = tmp0 + tmp3
 		tmp11 = tmp1 + tmp2
 		tmp12 = tmp0 + tmp2
 		tmp13 = tmp1 + tmp3
 		z1 = (tmp12 + tmp13) * fix_1_175875602
 		z1 += 1 << (constBits + pass1Bits - 1)
 		tmp0 *= fix_1_501321110
 		tmp1 *= fix_3_072711026
 		tmp2 *= fix_2_053119869
 		tmp3 *= fix_0_298631336
 		tmp10 *= -fix_0_899976223
 		tmp11 *= -fix_2_562915447
 		tmp12 *= -fix_0_390180644
 		tmp13 *= -fix_1_961570560
 		tmp12 += z1
 		tmp13 += z1
 		b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits)
 		b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits)
 		b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits)
 		b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits)
 	}
 }
--- a/src/image/jpeg/idct.go
+++ b/src/image/jpeg/idct.go
@ -1,194 +0,0 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package jpeg
 // This is a Go translation of idct.c from
 //
 // http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
 //
 // which carries the following notice:
 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
 /*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */
 const blockSize = 64 // A DCT block is 8x8.
 type block [blockSize]int32
 const (
 	w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
 	w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
 	w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16)
 	w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16)
 	w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16)
 	w7 = 565  // 2048*sqrt(2)*cos(7*pi/16)
 	w1pw7 = w1 + w7
 	w1mw7 = w1 - w7
 	w2pw6 = w2 + w6
 	w2mw6 = w2 - w6
 	w3pw5 = w3 + w5
 	w3mw5 = w3 - w5
 	r2 = 181 // 256/sqrt(2)
 )
 // idct performs a 2-D Inverse Discrete Cosine Transformation.
 //
 // The input coefficients should already have been multiplied by the
 // appropriate quantization table. We use fixed-point computation, with the
 // number of bits for the fractional component varying over the intermediate
 // stages.
 //
 // For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
 // discrete W transform and for the discrete Fourier transform", IEEE Trans. on
 // ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
 func idct(src *block) {
 	// Horizontal 1-D IDCT.
 	for y := 0; y < 8; y++ {
 		y8 := y * 8
 		s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
 		// If all the AC components are zero, then the IDCT is trivial.
 		if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
 			s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
 			dc := s[0] << 3
 			s[0] = dc
 			s[1] = dc
 			s[2] = dc
 			s[3] = dc
 			s[4] = dc
 			s[5] = dc
 			s[6] = dc
 			s[7] = dc
 			continue
 		}
 		// Prescale.
 		x0 := (s[0] << 11) + 128
 		x1 := s[4] << 11
 		x2 := s[6]
 		x3 := s[2]
 		x4 := s[1]
 		x5 := s[7]
 		x6 := s[5]
 		x7 := s[3]
 		// Stage 1.
 		x8 := w7 * (x4 + x5)
 		x4 = x8 + w1mw7*x4
 		x5 = x8 - w1pw7*x5
 		x8 = w3 * (x6 + x7)
 		x6 = x8 - w3mw5*x6
 		x7 = x8 - w3pw5*x7
 		// Stage 2.
 		x8 = x0 + x1
 		x0 -= x1
 		x1 = w6 * (x3 + x2)
 		x2 = x1 - w2pw6*x2
 		x3 = x1 + w2mw6*x3
 		x1 = x4 + x6
 		x4 -= x6
 		x6 = x5 + x7
 		x5 -= x7
 		// Stage 3.
 		x7 = x8 + x3
 		x8 -= x3
 		x3 = x0 + x2
 		x0 -= x2
 		x2 = (r2*(x4+x5) + 128) >> 8
 		x4 = (r2*(x4-x5) + 128) >> 8
 		// Stage 4.
 		s[0] = (x7 + x1) >> 8
 		s[1] = (x3 + x2) >> 8
 		s[2] = (x0 + x4) >> 8
 		s[3] = (x8 + x6) >> 8
 		s[4] = (x8 - x6) >> 8
 		s[5] = (x0 - x4) >> 8
 		s[6] = (x3 - x2) >> 8
 		s[7] = (x7 - x1) >> 8
 	}
 	// Vertical 1-D IDCT.
 	for x := 0; x < 8; x++ {
 		// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
 		// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
 		// we do not bother to check for the all-zero case.
 		s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
 		// Prescale.
 		y0 := (s[8*0] << 8) + 8192
 		y1 := s[8*4] << 8
 		y2 := s[8*6]
 		y3 := s[8*2]
 		y4 := s[8*1]
 		y5 := s[8*7]
 		y6 := s[8*5]
 		y7 := s[8*3]
 		// Stage 1.
 		y8 := w7*(y4+y5) + 4
 		y4 = (y8 + w1mw7*y4) >> 3
 		y5 = (y8 - w1pw7*y5) >> 3
 		y8 = w3*(y6+y7) + 4
 		y6 = (y8 - w3mw5*y6) >> 3
 		y7 = (y8 - w3pw5*y7) >> 3
 		// Stage 2.
 		y8 = y0 + y1
 		y0 -= y1
 		y1 = w6*(y3+y2) + 4
 		y2 = (y1 - w2pw6*y2) >> 3
 		y3 = (y1 + w2mw6*y3) >> 3
 		y1 = y4 + y6
 		y4 -= y6
 		y6 = y5 + y7
 		y5 -= y7
 		// Stage 3.
 		y7 = y8 + y3
 		y8 -= y3
 		y3 = y0 + y2
 		y0 -= y2
 		y2 = (r2*(y4+y5) + 128) >> 8
 		y4 = (r2*(y4-y5) + 128) >> 8
 		// Stage 4.
 		s[8*0] = (y7 + y1) >> 14
 		s[8*1] = (y3 + y2) >> 14
 		s[8*2] = (y0 + y4) >> 14
 		s[8*3] = (y8 + y6) >> 14
 		s[8*4] = (y8 - y6) >> 14
 		s[8*5] = (y0 - y4) >> 14
 		s[8*6] = (y3 - y2) >> 14
 		s[8*7] = (y7 - y1) >> 14
 	}
 }