image/jpeg: replace fdct.go and idct.go with new implementation in dct.go

The fdct.go and idct.go files were derived from the MPEG-SSG and
JPEG-IJG reference code and therefore carry licenses specific to those
groups. Various license checkers flag these files as potentially
problematic. The code is also not terribly well documented.

This CL fixes the license problem by adding a new, from-scratch
implementation using a different algorithm. As a bonus, the new code
is both faster and more accurate than the old encumbered code.

On speed, the new code is up to 20% faster; benchmarks below.

On accuracy, in the set of blocks used in the test, we can measure
the number of output values that are off-by-one from the exact rounded answer.
The old FDCT was off in 8.6% of values; the new one is off in 2.5%.
The old IDCT was off in 1.4% of values; the new one is off in 1.2%.

goos: darwin
goarch: arm64
pkg: image/jpeg
cpu: Apple M3 Pro
                     │     old     │                 new                 │
                     │   sec/op    │   sec/op     vs base                │
FDCT-12                619.6n ± 3%   586.5n ± 1%   -5.34% (p=0.000 n=10)
IDCT-12                752.4n ± 4%   628.0n ± 1%  -16.54% (p=0.000 n=10)

goos: linux
goarch: amd64
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
               │     old     │                 new                 │
               │   sec/op    │   sec/op     vs base                │
FDCT-16          1.817µ ± 0%   1.542µ ± 0%  -15.11% (p=0.000 n=10)
IDCT-16          1.897µ ± 0%   1.514µ ± 0%  -20.22% (p=0.000 n=10)

goos: linux
goarch: arm64
cpu: whatever gotip-linux-arm64 has
              │     old     │                new                 │
              │   sec/op    │   sec/op     vs base               │
FDCT-8          1.844µ ± 0%   1.847µ ± 0%  +0.14% (p=0.000 n=10)
IDCT-8          2.127µ ± 0%   1.973µ ± 0%  -7.26% (p=0.000 n=10)

Change-Id: Ie6d14103c8478ba5a779f234da84f345828eb925
Reviewed-on: https://go-review.googlesource.com/c/go/+/705518
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Nigel Tao <nigeltao@google.com>
This commit is contained in:
Russ Cox 2025-09-18 12:27:06 -04:00 committed by Gopher Robot
parent 92e093467f
commit fbba930271
5 changed files with 538 additions and 403 deletions

View file

@ -70,22 +70,22 @@ func Example() {
} }
// Output: // Output:
// bin red green blue alpha // bin red green blue alpha
// 0x0000-0x0fff: 364 790 7242 0 // 0x0000-0x0fff: 362 793 7245 0
// 0x1000-0x1fff: 645 2967 1039 0 // 0x1000-0x1fff: 648 2963 1036 0
// 0x2000-0x2fff: 1072 2299 979 0 // 0x2000-0x2fff: 1072 2301 977 0
// 0x3000-0x3fff: 820 2266 980 0 // 0x3000-0x3fff: 819 2266 982 0
// 0x4000-0x4fff: 537 1305 541 0 // 0x4000-0x4fff: 537 1303 541 0
// 0x5000-0x5fff: 319 962 261 0 // 0x5000-0x5fff: 321 964 261 0
// 0x6000-0x6fff: 322 375 177 0 // 0x6000-0x6fff: 321 375 177 0
// 0x7000-0x7fff: 601 279 214 0 // 0x7000-0x7fff: 599 278 213 0
// 0x8000-0x8fff: 3478 227 273 0 // 0x8000-0x8fff: 3478 228 275 0
// 0x9000-0x9fff: 2260 234 329 0 // 0x9000-0x9fff: 2260 233 328 0
// 0xa000-0xafff: 921 282 373 0 // 0xa000-0xafff: 921 282 374 0
// 0xb000-0xbfff: 321 335 397 0 // 0xb000-0xbfff: 322 335 395 0
// 0xc000-0xcfff: 229 388 298 0 // 0xc000-0xcfff: 228 388 299 0
// 0xd000-0xdfff: 260 414 277 0 // 0xd000-0xdfff: 261 415 277 0
// 0xe000-0xefff: 516 428 298 0 // 0xe000-0xefff: 516 423 297 0
// 0xf000-0xffff: 2785 1899 1772 15450 // 0xf000-0xffff: 2785 1903 1773 15450
} }
const data = ` const data = `

521
src/image/jpeg/dct.go Normal file
View file

@ -0,0 +1,521 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package jpeg
// Discrete Cosine Transformation (DCT) implementations using the algorithm from
// Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
// “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
// https://ieeexplore.ieee.org/document/266596
//
// Since the paper is paywalled, the rest of this comment gives a summary.
//
// A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
// and transforms them in place into the result values.
//
// The mathematical definition of the N-point 1D FDCT is:
//
// X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
//
// where α₀ = √2 and α_k = 1 for k > 0.
//
// For our purposes, N=8, so the angles end up being multiples of π/16.
// The most direct implementation of this definition would require 64 multiplications.
//
// Loeffler's paper presents a more efficient computation that requires only
// 11 multiplications and works in terms of three basic operations:
//
// - A “butterfly” x0, x1 = x0+x1, x0-x1.
// The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
//
// - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
//
// - A rotation of x0, x1 by θ, defined as:
// x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
// The inverse is rotation by -θ.
//
// The algorithm proceeds in four stages:
//
// Stage 1:
// - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
//
// Stage 2:
// - butterfly x0, x3; x1, x2
// - rotate x4, x7 by 3π/16
// - rotate x5, x6 by π/16.
//
// Stage 3:
// - butterfly x0, x1; x4, x6; x7, x5
// - rotate x2, x3 by 6π/16 and scale by √2.
//
// Stage 4:
// - butterfly x7, x4
// - scale x5, x6 by √2.
//
// Finally, the values are permuted. The permutation can be read as either:
// - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
// - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
// The code below uses the second form to make it easier to merge adjacent stores.
// (Note that unlike in recursive FFT implementations, the permutation here is
// not always mapping indexes to their bit reversals.)
//
// As written above, the rotation requires four multiplications, but it can be
// reduced to three by refactoring (see [dctBox] below), and the scaling in
// stage 3 can be merged into the rotation constants, so the overall cost
// of a 1D FDCT is 11 multiplies.
//
// The 1D inverse DCT (IDCT) is the 1D FDCT run backward
// with all the basic operations inverted.
// dctBox implements a 3-multiply, 3-add rotation+scaling.
// Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
// rotated and scaled coordinates.
// (It is called dctBox because the rotate+scale operation
// is drawn as a box in Figures 1 and 2 in the paper.)
func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
// y0 = x0*kcos + x1*ksin
// y1 = -x0*ksin + x1*kcos
ksum := kcos * (x0 + x1)
y0 = ksum + (ksin-kcos)*x1
y1 = ksum - (kcos+ksin)*x0
return y0, y1
}
// A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
// The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
// but it is convenient to use int32s for intermediate storage,
// so we define only a single block type of [8*8]int32.
//
// A 2D DCT is implemented as 1D DCTs over the rows and columns.
//
// dct_test.go defines a String method for nice printing in tests.
type block [blockSize]int32
const blockSize = 8 * 8
// Note on Numerical Precision
//
// The inputs to both the FDCT and IDCT are uint8 values stored in a block,
// and the outputs are int16s in the same block, but the overall operation
// uses int32 values as fixed-point intermediate values.
// In the code comments below, the notation “QN.M” refers to a
// signed value of 1+N+M significant bits, one of which is the sign bit,
// and M of which hold fractional (sub-integer) precision.
// For example, 255 as a Q8.0 value is stored as int32(255),
// while 255 as a Q8.1 value is stored as int32(510),
// and 255.5 as a Q8.1 value is int32(511).
// The notation UQN.M refers to an unsigned value of N+M significant bits.
// See https://en.wikipedia.org/wiki/Q_(number_format) for more.
//
// In general we only need to keep about 16 significant bits, but it is more
// efficient and somewhat more precise to let unnecessary fractional bits
// accumulate and shift them away in bulk rather than after every operation.
// As such, it is important to keep track of the number of fractional bits
// in each variable at different points in the code, to avoid mistakes like
// adding numbers with different fractional precisions, as well as to keep
// track of the total number of bits, to avoid overflow. A comment like:
//
// // x[123] now Q8.2.
//
// means that x1, x2, and x3 are all Q8.2 (11-bit) values.
// Keeping extra precision bits also reduces the size of the errors introduced
// by using right shift to approximate rounded division.
// Constants needed for the implementation.
// These are all 60-bit precision fixed-point constants.
// The function c(val, b) rounds the constant to b bits.
// c is simple enough that calls to it with constant args
// are inlined and constant-propagated down to an inline constant.
// Each constant is commented with its Ivy definition (see robpike.io/ivy),
// using this scaling helper function:
//
// op fix x = floor 0.5 + x * 2**60
const (
cos1 = 1130768441178740757 // fix cos 1*pi/16
sin1 = 224923827593068887 // fix sin 1*pi/16
cos3 = 958619196450722178 // fix cos 3*pi/16
sin3 = 640528868967736374 // fix sin 3*pi/16
sqrt2 = 1630477228166597777 // fix sqrt 2
sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)*cos 6*pi/16
sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
sqrt2inv = 815238614083298888 // fix 1/sqrt 2
sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)*cos 6*pi/16
sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)*sin 6*pi/16
)
func c(x uint64, bits int) int32 {
return int32((x + (1 << (59 - bits))) >> (60 - bits))
}
// fdct implements the forward DCT.
// Inputs are UQ8.0; outputs are Q13.0.
func fdct(b *block) {
fdctCols(b)
fdctRows(b)
}
// fdctCols applies the 1D DCT to the columns of b.
// Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
// Outputs are Q10.18.
func fdctCols(b *block) {
for i := range 8 {
x0 := b[0*8+i]
x1 := b[1*8+i]
x2 := b[2*8+i]
x3 := b[3*8+i]
x4 := b[4*8+i]
x5 := b[5*8+i]
x6 := b[6*8+i]
x7 := b[7*8+i]
// x[01234567] are UQ8.0 in [0,255].
// Stage 1: four butterflies.
// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.
x0, x7 = x0+x7, x0-x7
x1, x6 = x1+x6, x1-x6
x2, x5 = x2+x5, x2-x5
x3, x4 = x3+x4, x3-x4
// x[0123] now UQ9.0 in [0, 510].
// x[4567] now Q8.0 in [-255,255].
// Stage 2: two boxes and two butterflies.
// A box on QN.M inputs with B-bit constants
// produces Q(N+1).(M+B) outputs.
// (The +1 is from the addition.)
x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
// x[47] now Q9.18 in [-354, 354].
// x[56] now Q9.18 in [-300, 300].
x0, x3 = x0+x3, x0-x3
x1, x2 = x1+x2, x1-x2
// x[01] now UQ10.0 in [0, 1020].
// x[23] now Q9.0 in [-510, 510].
// Stage 3: one box and three butterflies.
x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
// x[23] now Q10.18 in [-943, 943].
x0, x1 = x0+x1, x0-x1
// x0 now UQ11.0 in [0, 2040].
// x1 now Q10.0 in [-1020, 1020].
// Store x0, x1, x2, x3 to their permuted targets.
// The original +128 in every input value
// has cancelled out except in the “DC signal” x0.
// Subtracting 128*8 here is equivalent to subtracting 128
// from every input before we started, but cheaper.
// It also converts x0 from UQ11.18 to Q10.18.
b[0*8+i] = (x0 - 128*8) << 18
b[4*8+i] = x1 << 18
b[2*8+i] = x2
b[6*8+i] = x3
x4, x6 = x4+x6, x4-x6
x7, x5 = x7+x5, x7-x5
// x[4567] now Q10.18 in [-654, 654].
// Stage 4: two √2 scalings and one butterfly.
x5 = (x5 >> 12) * c(sqrt2, 12)
x6 = (x6 >> 12) * c(sqrt2, 12)
// x[56] still Q10.18 in [-925, 925] (= 654√2).
x7, x4 = x7+x4, x7-x4
// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
// This is not obvious at all! See “Note on 925” below.
// Store x4 x5 x6 x7 to their permuted targets.
b[1*8+i] = x7
b[3*8+i] = x5
b[5*8+i] = x6
b[7*8+i] = x4
}
}
// fdctRows applies the 1D DCT to the rows of b.
// Inputs are Q10.18; outputs are Q13.0.
func fdctRows(b *block) {
for i := range 8 {
x := b[8*i : 8*i+8 : 8*i+8]
x0 := x[0]
x1 := x[1]
x2 := x[2]
x3 := x[3]
x4 := x[4]
x5 := x[5]
x6 := x[6]
x7 := x[7]
// x[01234567] are Q10.18 [-1020, 1020].
// Stage 1: four butterflies.
x0, x7 = x0+x7, x0-x7
x1, x6 = x1+x6, x1-x6
x2, x5 = x2+x5, x2-x5
x3, x4 = x3+x4, x3-x4
// x[01234567] now Q11.18 in [-2040, 2040].
// Stage 2: two boxes and two butterflies.
x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
// x[47] now Q12.18 in [-2830, 2830].
// x[56] now Q12.18 in [-2400, 2400].
x0, x3 = x0+x3, x0-x3
x1, x2 = x1+x2, x1-x2
// x[01234567] now Q12.18 in [-4080, 4080].
// Stage 3: one box and three butterflies.
x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
// x[23] now Q13.18 in [-7539, 7539].
x0, x1 = x0+x1, x0-x1
// x[01] now Q13.18 in [-8160, 8160].
x4, x6 = x4+x6, x4-x6
x7, x5 = x7+x5, x7-x5
// x[4567] now Q13.18 in [-5230, 5230].
// Stage 4: two √2 scalings and one butterfly.
x5 = (x5 >> 14) * c(sqrt2, 14)
x6 = (x6 >> 14) * c(sqrt2, 14)
// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
x7, x4 = x7+x4, x7-x4
// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
// See “Note on 925” below.
// Cut from Q13.18 to Q13.0.
x0 = (x0 + 1<<17) >> 18
x1 = (x1 + 1<<17) >> 18
x2 = (x2 + 1<<17) >> 18
x3 = (x3 + 1<<17) >> 18
x4 = (x4 + 1<<17) >> 18
x5 = (x5 + 1<<17) >> 18
x6 = (x6 + 1<<17) >> 18
x7 = (x7 + 1<<17) >> 18
// Note: Unlike in fdctCols, saved all stores for the end
// because they are adjacent memory locations and some systems
// can use multiword stores.
x[0] = x0
x[1] = x7
x[2] = x2
x[3] = x5
x[4] = x1
x[5] = x6
x[6] = x3
x[7] = x4
}
}
// “Note on 925”, deferred from above to avoid interrupting code.
//
// In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
// Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
//
// Stage 2:
// x4 = b4*cos3 + b7*sin3
// x7 = -b4*sin3 + b7*cos3
// x5 = b5*cos1 + b6*sin1
// x6 = -b5*sin1 + b6*cos1
//
// Stage 3:
//
// x4 = x4+x6 = b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
// x6 = x4-x6 = b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
// x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
// x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
//
// Stage 4:
//
// x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
// = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
// < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
//
// x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
// = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
// < same 925.
//
// The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
// the same kinds of numbers for all four, just with different paths to them.
//
// In fdctRows, the same analysis applies, but the initial values are
// in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.
// idct implements the inverse DCT.
// Inputs are UQ8.0; outputs are Q10.3.
func idct(b *block) {
// A 2D IDCT is a 1D IDCT on rows followed by columns.
idctRows(b)
idctCols(b)
}
// idctRows applies the 1D IDCT to the rows of b.
// Inputs are UQ8.0; outputs are Q9.20.
func idctRows(b *block) {
for i := range 8 {
x := b[8*i : 8*i+8 : 8*i+8]
x0 := x[0]
x7 := x[1]
x2 := x[2]
x5 := x[3]
x1 := x[4]
x6 := x[5]
x3 := x[6]
x4 := x[7]
// Run FDCT backward.
// Independent operations have been reordered somewhat
// to make precision tracking easier.
//
// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
// and carries with it an implicit divide by two: the extra bit
// is added to the precision, not the value size.
// x[01234567] are UQ8.0 in [0, 255].
// Stages 4, 3, 2: x0, x1, x2, x3.
x0 <<= 17
x1 <<= 17
// x0, x1 now UQ8.17.
x0, x1 = x0+x1, x0-x1
// x0 now UQ8.18 in [0, 255].
// x1 now Q7.18 in [-127½, 127½].
// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
// x[23] now Q8.18 in [-236, 236].
x1, x2 = x1+x2, x1-x2
x0, x3 = x0+x3, x0-x3
// x[0123] now Q8.19 in [-246, 246].
// Stages 4, 3, 2: x4, x5, x6, x7.
x4 <<= 7
x7 <<= 7
// x[47] now UQ8.7
x7, x4 = x7+x4, x7-x4
// x7 now UQ8.8 in [0, 255].
// x4 now Q7.8 in [-127½, 127½].
x6 = x6 * c(sqrt2inv, 8)
x5 = x5 * c(sqrt2inv, 8)
// x[56] now UQ8.8 in [0, 181].
// Note that 1/√2 has five 0s in its binary representation after
// the 8th bit, so this multipliy is actually producing 12 bits of precision.
x7, x5 = x7+x5, x7-x5
x4, x6 = x4+x6, x4-x6
// x[4567] now Q8.9 in [-218, 218].
x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
// x[4567] now Q9.19 in [-303, 303].
// Stage 1.
x0, x7 = x0+x7, x0-x7
x1, x6 = x1+x6, x1-x6
x2, x5 = x2+x5, x2-x5
x3, x4 = x3+x4, x3-x4
// x[01234567] now Q9.20 in [-275, 275].
// Note: we don't need all 20 bits of “precision”,
// but it is faster to let idctCols shift it away as part
// of other operations rather than downshift here.
x[0] = x0
x[1] = x1
x[2] = x2
x[3] = x3
x[4] = x4
x[5] = x5
x[6] = x6
x[7] = x7
}
}
// idctCols applies the 1D IDCT to the columns of b.
// Inputs are Q9.20.
// Outputs are Q10.3. That is, the result is the IDCT*8.
func idctCols(b *block) {
for i := range 8 {
x0 := b[0*8+i]
x7 := b[1*8+i]
x2 := b[2*8+i]
x5 := b[3*8+i]
x1 := b[4*8+i]
x6 := b[5*8+i]
x3 := b[6*8+i]
x4 := b[7*8+i]
// x[012345678] are Q9.20.
// Start by adding 0.5 to x0 (the incoming DC signal).
// The butterflies will add it to all the other values,
// and then the final shifts will round properly.
x0 += 1 << 19
// Stages 4, 3, 2: x0, x1, x2, x3.
x0, x1 = (x0+x1)>>2, (x0-x1)>>2
// x[01] now Q9.19.
// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
// x[0123] now Q9.19.
x1, x2 = x1+x2, x1-x2
x0, x3 = x0+x3, x0-x3
// x[0123] now Q9.20.
// Stages 4, 3, 2: x4, x5, x6, x7.
x7, x4 = x7+x4, x7-x4
// x[47] now Q9.21.
x5 = (x5 >> 13) * c(sqrt2inv, 14)
x6 = (x6 >> 13) * c(sqrt2inv, 14)
// x[56] now Q9.21.
x7, x5 = x7+x5, x7-x5
x4, x6 = x4+x6, x4-x6
// x[4567] now Q9.22.
x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
// x[4567] now Q10.20.
x0, x7 = x0+x7, x0-x7
x1, x6 = x1+x6, x1-x6
x2, x5 = x2+x5, x2-x5
x3, x4 = x3+x4, x3-x4
// x[01234567] now Q10.21.
x0 >>= 18
x1 >>= 18
x2 >>= 18
x3 >>= 18
x4 >>= 18
x5 >>= 18
x6 >>= 18
x7 >>= 18
// x[01234567] now Q10.3.
b[0*8+i] = x0
b[1*8+i] = x1
b[2*8+i] = x2
b[3*8+i] = x3
b[4*8+i] = x4
b[5*8+i] = x5
b[6*8+i] = x6
b[7*8+i] = x7
}
}

View file

@ -92,7 +92,7 @@ func TestDCT(t *testing.T) {
} }
// Check that the optimized and slow FDCT implementations agree. // Check that the optimized and slow FDCT implementations agree.
testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 16) testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 8)
testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8) testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8)
} }

View file

@ -1,192 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package jpeg
// This file implements a Forward Discrete Cosine Transformation.
/*
It is based on the code in jfdctint.c from the Independent JPEG Group,
found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz.
The "LEGAL ISSUES" section of the README in that archive says:
In plain English:
1. We don't promise that this software works. (But if you find any bugs,
please let us know!)
2. You can use this software for whatever you want. You don't have to pay us.
3. You may not pretend that you wrote this software. If you use it in a
program, you must acknowledge somewhere in your documentation that
you've used the IJG code.
In legalese:
The authors make NO WARRANTY or representation, either express or implied,
with respect to this software, its quality, accuracy, merchantability, or
fitness for a particular purpose. This software is provided "AS IS", and you,
its user, assume the entire risk as to its quality and accuracy.
This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding.
All Rights Reserved except as specified below.
Permission is hereby granted to use, copy, modify, and distribute this
software (or portions thereof) for any purpose, without fee, subject to these
conditions:
(1) If any part of the source code for this software is distributed, then this
README file must be included, with this copyright and no-warranty notice
unaltered; and any additions, deletions, or changes to the original files
must be clearly indicated in accompanying documentation.
(2) If only executable code is distributed, then the accompanying
documentation must state that "this software is based in part on the work of
the Independent JPEG Group".
(3) Permission for use of this software is granted only if the user accepts
full responsibility for any undesirable consequences; the authors accept
NO LIABILITY for damages of any kind.
These conditions apply to any software derived from or based on the IJG code,
not just to the unmodified library. If you use our work, you ought to
acknowledge us.
Permission is NOT granted for the use of any IJG author's name or company name
in advertising or publicity relating to this software or products derived from
it. This software may be referred to only as "the Independent JPEG Group's
software".
We specifically permit and encourage the use of this software as the basis of
commercial products, provided that all warranty or liability claims are
assumed by the product vendor.
*/
// Trigonometric constants in 13-bit fixed point format.
const (
fix_0_298631336 = 2446
fix_0_390180644 = 3196
fix_0_541196100 = 4433
fix_0_765366865 = 6270
fix_0_899976223 = 7373
fix_1_175875602 = 9633
fix_1_501321110 = 12299
fix_1_847759065 = 15137
fix_1_961570560 = 16069
fix_2_053119869 = 16819
fix_2_562915447 = 20995
fix_3_072711026 = 25172
)
const (
constBits = 13
pass1Bits = 2
centerJSample = 128
)
// fdct performs a forward DCT on an 8x8 block of coefficients, including a
// level shift.
func fdct(b *block) {
// Pass 1: process rows.
for y := 0; y < 8; y++ {
y8 := y * 8
s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
x0 := s[0]
x1 := s[1]
x2 := s[2]
x3 := s[3]
x4 := s[4]
x5 := s[5]
x6 := s[6]
x7 := s[7]
tmp0 := x0 + x7
tmp1 := x1 + x6
tmp2 := x2 + x5
tmp3 := x3 + x4
tmp10 := tmp0 + tmp3
tmp12 := tmp0 - tmp3
tmp11 := tmp1 + tmp2
tmp13 := tmp1 - tmp2
tmp0 = x0 - x7
tmp1 = x1 - x6
tmp2 = x2 - x5
tmp3 = x3 - x4
s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
s[4] = (tmp10 - tmp11) << pass1Bits
z1 := (tmp12 + tmp13) * fix_0_541196100
z1 += 1 << (constBits - pass1Bits - 1)
s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
tmp10 = tmp0 + tmp3
tmp11 = tmp1 + tmp2
tmp12 = tmp0 + tmp2
tmp13 = tmp1 + tmp3
z1 = (tmp12 + tmp13) * fix_1_175875602
z1 += 1 << (constBits - pass1Bits - 1)
tmp0 *= fix_1_501321110
tmp1 *= fix_3_072711026
tmp2 *= fix_2_053119869
tmp3 *= fix_0_298631336
tmp10 *= -fix_0_899976223
tmp11 *= -fix_2_562915447
tmp12 *= -fix_0_390180644
tmp13 *= -fix_1_961570560
tmp12 += z1
tmp13 += z1
s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
}
// Pass 2: process columns.
// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
for x := 0; x < 8; x++ {
tmp0 := b[0*8+x] + b[7*8+x]
tmp1 := b[1*8+x] + b[6*8+x]
tmp2 := b[2*8+x] + b[5*8+x]
tmp3 := b[3*8+x] + b[4*8+x]
tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1)
tmp12 := tmp0 - tmp3
tmp11 := tmp1 + tmp2
tmp13 := tmp1 - tmp2
tmp0 = b[0*8+x] - b[7*8+x]
tmp1 = b[1*8+x] - b[6*8+x]
tmp2 = b[2*8+x] - b[5*8+x]
tmp3 = b[3*8+x] - b[4*8+x]
b[0*8+x] = (tmp10 + tmp11) >> pass1Bits
b[4*8+x] = (tmp10 - tmp11) >> pass1Bits
z1 := (tmp12 + tmp13) * fix_0_541196100
z1 += 1 << (constBits + pass1Bits - 1)
b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits)
b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits)
tmp10 = tmp0 + tmp3
tmp11 = tmp1 + tmp2
tmp12 = tmp0 + tmp2
tmp13 = tmp1 + tmp3
z1 = (tmp12 + tmp13) * fix_1_175875602
z1 += 1 << (constBits + pass1Bits - 1)
tmp0 *= fix_1_501321110
tmp1 *= fix_3_072711026
tmp2 *= fix_2_053119869
tmp3 *= fix_0_298631336
tmp10 *= -fix_0_899976223
tmp11 *= -fix_2_562915447
tmp12 *= -fix_0_390180644
tmp13 *= -fix_1_961570560
tmp12 += z1
tmp13 += z1
b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits)
b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits)
b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits)
b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits)
}
}

View file

@ -1,194 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package jpeg
// This is a Go translation of idct.c from
//
// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
//
// which carries the following notice:
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
/*
* Disclaimer of Warranty
*
* These software programs are available to the user without any license fee or
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
* any and all warranties, whether express, implied, or statuary, including any
* implied warranties or merchantability or of fitness for a particular
* purpose. In no event shall the copyright-holder be liable for any
* incidental, punitive, or consequential damages of any kind whatsoever
* arising from the use of these programs.
*
* This disclaimer of warranty extends to the user of these programs and user's
* customers, employees, agents, transferees, successors, and assigns.
*
* The MPEG Software Simulation Group does not represent or warrant that the
* programs furnished hereunder are free of infringement of any third-party
* patents.
*
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
* are subject to royalty fees to patent holders. Many of these patents are
* general enough such that they are unavoidable regardless of implementation
* design.
*
*/
const blockSize = 64 // A DCT block is 8x8.
type block [blockSize]int32
const (
w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16)
w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16)
w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16)
w7 = 565 // 2048*sqrt(2)*cos(7*pi/16)
w1pw7 = w1 + w7
w1mw7 = w1 - w7
w2pw6 = w2 + w6
w2mw6 = w2 - w6
w3pw5 = w3 + w5
w3mw5 = w3 - w5
r2 = 181 // 256/sqrt(2)
)
// idct performs a 2-D Inverse Discrete Cosine Transformation.
//
// The input coefficients should already have been multiplied by the
// appropriate quantization table. We use fixed-point computation, with the
// number of bits for the fractional component varying over the intermediate
// stages.
//
// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
func idct(src *block) {
// Horizontal 1-D IDCT.
for y := 0; y < 8; y++ {
y8 := y * 8
s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
// If all the AC components are zero, then the IDCT is trivial.
if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
dc := s[0] << 3
s[0] = dc
s[1] = dc
s[2] = dc
s[3] = dc
s[4] = dc
s[5] = dc
s[6] = dc
s[7] = dc
continue
}
// Prescale.
x0 := (s[0] << 11) + 128
x1 := s[4] << 11
x2 := s[6]
x3 := s[2]
x4 := s[1]
x5 := s[7]
x6 := s[5]
x7 := s[3]
// Stage 1.
x8 := w7 * (x4 + x5)
x4 = x8 + w1mw7*x4
x5 = x8 - w1pw7*x5
x8 = w3 * (x6 + x7)
x6 = x8 - w3mw5*x6
x7 = x8 - w3pw5*x7
// Stage 2.
x8 = x0 + x1
x0 -= x1
x1 = w6 * (x3 + x2)
x2 = x1 - w2pw6*x2
x3 = x1 + w2mw6*x3
x1 = x4 + x6
x4 -= x6
x6 = x5 + x7
x5 -= x7
// Stage 3.
x7 = x8 + x3
x8 -= x3
x3 = x0 + x2
x0 -= x2
x2 = (r2*(x4+x5) + 128) >> 8
x4 = (r2*(x4-x5) + 128) >> 8
// Stage 4.
s[0] = (x7 + x1) >> 8
s[1] = (x3 + x2) >> 8
s[2] = (x0 + x4) >> 8
s[3] = (x8 + x6) >> 8
s[4] = (x8 - x6) >> 8
s[5] = (x0 - x4) >> 8
s[6] = (x3 - x2) >> 8
s[7] = (x7 - x1) >> 8
}
// Vertical 1-D IDCT.
for x := 0; x < 8; x++ {
// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
// we do not bother to check for the all-zero case.
s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
// Prescale.
y0 := (s[8*0] << 8) + 8192
y1 := s[8*4] << 8
y2 := s[8*6]
y3 := s[8*2]
y4 := s[8*1]
y5 := s[8*7]
y6 := s[8*5]
y7 := s[8*3]
// Stage 1.
y8 := w7*(y4+y5) + 4
y4 = (y8 + w1mw7*y4) >> 3
y5 = (y8 - w1pw7*y5) >> 3
y8 = w3*(y6+y7) + 4
y6 = (y8 - w3mw5*y6) >> 3
y7 = (y8 - w3pw5*y7) >> 3
// Stage 2.
y8 = y0 + y1
y0 -= y1
y1 = w6*(y3+y2) + 4
y2 = (y1 - w2pw6*y2) >> 3
y3 = (y1 + w2mw6*y3) >> 3
y1 = y4 + y6
y4 -= y6
y6 = y5 + y7
y5 -= y7
// Stage 3.
y7 = y8 + y3
y8 -= y3
y3 = y0 + y2
y0 -= y2
y2 = (r2*(y4+y5) + 128) >> 8
y4 = (r2*(y4-y5) + 128) >> 8
// Stage 4.
s[8*0] = (y7 + y1) >> 14
s[8*1] = (y3 + y2) >> 14
s[8*2] = (y0 + y4) >> 14
s[8*3] = (y8 + y6) >> 14
s[8*4] = (y8 - y6) >> 14
s[8*5] = (y0 - y4) >> 14
s[8*6] = (y3 - y2) >> 14
s[8*7] = (y7 - y1) >> 14
}
}