mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
image/jpeg: replace fdct.go and idct.go with new implementation in dct.go
The fdct.go and idct.go files were derived from the MPEG-SSG and
JPEG-IJG reference code and therefore carry licenses specific to those
groups. Various license checkers flag these files as potentially
problematic. The code is also not terribly well documented.
This CL fixes the license problem by adding a new, from-scratch
implementation using a different algorithm. As a bonus, the new code
is both faster and more accurate than the old encumbered code.
On speed, the new code is up to 20% faster; benchmarks below.
On accuracy, in the set of blocks used in the test, we can measure
the number of output values that are off-by-one from the exact rounded answer.
The old FDCT was off in 8.6% of values; the new one is off in 2.5%.
The old IDCT was off in 1.4% of values; the new one is off in 1.2%.
goos: darwin
goarch: arm64
pkg: image/jpeg
cpu: Apple M3 Pro
│ old │ new │
│ sec/op │ sec/op vs base │
FDCT-12 619.6n ± 3% 586.5n ± 1% -5.34% (p=0.000 n=10)
IDCT-12 752.4n ± 4% 628.0n ± 1% -16.54% (p=0.000 n=10)
goos: linux
goarch: amd64
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
│ old │ new │
│ sec/op │ sec/op vs base │
FDCT-16 1.817µ ± 0% 1.542µ ± 0% -15.11% (p=0.000 n=10)
IDCT-16 1.897µ ± 0% 1.514µ ± 0% -20.22% (p=0.000 n=10)
goos: linux
goarch: arm64
cpu: whatever gotip-linux-arm64 has
│ old │ new │
│ sec/op │ sec/op vs base │
FDCT-8 1.844µ ± 0% 1.847µ ± 0% +0.14% (p=0.000 n=10)
IDCT-8 2.127µ ± 0% 1.973µ ± 0% -7.26% (p=0.000 n=10)
Change-Id: Ie6d14103c8478ba5a779f234da84f345828eb925
Reviewed-on: https://go-review.googlesource.com/c/go/+/705518
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Nigel Tao <nigeltao@google.com>
This commit is contained in:
parent
92e093467f
commit
fbba930271
5 changed files with 538 additions and 403 deletions
|
|
@ -70,22 +70,22 @@ func Example() {
|
||||||
}
|
}
|
||||||
// Output:
|
// Output:
|
||||||
// bin red green blue alpha
|
// bin red green blue alpha
|
||||||
// 0x0000-0x0fff: 364 790 7242 0
|
// 0x0000-0x0fff: 362 793 7245 0
|
||||||
// 0x1000-0x1fff: 645 2967 1039 0
|
// 0x1000-0x1fff: 648 2963 1036 0
|
||||||
// 0x2000-0x2fff: 1072 2299 979 0
|
// 0x2000-0x2fff: 1072 2301 977 0
|
||||||
// 0x3000-0x3fff: 820 2266 980 0
|
// 0x3000-0x3fff: 819 2266 982 0
|
||||||
// 0x4000-0x4fff: 537 1305 541 0
|
// 0x4000-0x4fff: 537 1303 541 0
|
||||||
// 0x5000-0x5fff: 319 962 261 0
|
// 0x5000-0x5fff: 321 964 261 0
|
||||||
// 0x6000-0x6fff: 322 375 177 0
|
// 0x6000-0x6fff: 321 375 177 0
|
||||||
// 0x7000-0x7fff: 601 279 214 0
|
// 0x7000-0x7fff: 599 278 213 0
|
||||||
// 0x8000-0x8fff: 3478 227 273 0
|
// 0x8000-0x8fff: 3478 228 275 0
|
||||||
// 0x9000-0x9fff: 2260 234 329 0
|
// 0x9000-0x9fff: 2260 233 328 0
|
||||||
// 0xa000-0xafff: 921 282 373 0
|
// 0xa000-0xafff: 921 282 374 0
|
||||||
// 0xb000-0xbfff: 321 335 397 0
|
// 0xb000-0xbfff: 322 335 395 0
|
||||||
// 0xc000-0xcfff: 229 388 298 0
|
// 0xc000-0xcfff: 228 388 299 0
|
||||||
// 0xd000-0xdfff: 260 414 277 0
|
// 0xd000-0xdfff: 261 415 277 0
|
||||||
// 0xe000-0xefff: 516 428 298 0
|
// 0xe000-0xefff: 516 423 297 0
|
||||||
// 0xf000-0xffff: 2785 1899 1772 15450
|
// 0xf000-0xffff: 2785 1903 1773 15450
|
||||||
}
|
}
|
||||||
|
|
||||||
const data = `
|
const data = `
|
||||||
|
|
|
||||||
521
src/image/jpeg/dct.go
Normal file
521
src/image/jpeg/dct.go
Normal file
|
|
@ -0,0 +1,521 @@
|
||||||
|
// Copyright 2025 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package jpeg
|
||||||
|
|
||||||
|
// Discrete Cosine Transformation (DCT) implementations using the algorithm from
|
||||||
|
// Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
|
||||||
|
// “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
|
||||||
|
// https://ieeexplore.ieee.org/document/266596
|
||||||
|
//
|
||||||
|
// Since the paper is paywalled, the rest of this comment gives a summary.
|
||||||
|
//
|
||||||
|
// A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
|
||||||
|
// and transforms them in place into the result values.
|
||||||
|
//
|
||||||
|
// The mathematical definition of the N-point 1D FDCT is:
|
||||||
|
//
|
||||||
|
// X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
|
||||||
|
//
|
||||||
|
// where α₀ = √2 and α_k = 1 for k > 0.
|
||||||
|
//
|
||||||
|
// For our purposes, N=8, so the angles end up being multiples of π/16.
|
||||||
|
// The most direct implementation of this definition would require 64 multiplications.
|
||||||
|
//
|
||||||
|
// Loeffler's paper presents a more efficient computation that requires only
|
||||||
|
// 11 multiplications and works in terms of three basic operations:
|
||||||
|
//
|
||||||
|
// - A “butterfly” x0, x1 = x0+x1, x0-x1.
|
||||||
|
// The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
|
||||||
|
//
|
||||||
|
// - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
|
||||||
|
//
|
||||||
|
// - A rotation of x0, x1 by θ, defined as:
|
||||||
|
// x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
|
||||||
|
// The inverse is rotation by -θ.
|
||||||
|
//
|
||||||
|
// The algorithm proceeds in four stages:
|
||||||
|
//
|
||||||
|
// Stage 1:
|
||||||
|
// - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
|
||||||
|
//
|
||||||
|
// Stage 2:
|
||||||
|
// - butterfly x0, x3; x1, x2
|
||||||
|
// - rotate x4, x7 by 3π/16
|
||||||
|
// - rotate x5, x6 by π/16.
|
||||||
|
//
|
||||||
|
// Stage 3:
|
||||||
|
// - butterfly x0, x1; x4, x6; x7, x5
|
||||||
|
// - rotate x2, x3 by 6π/16 and scale by √2.
|
||||||
|
//
|
||||||
|
// Stage 4:
|
||||||
|
// - butterfly x7, x4
|
||||||
|
// - scale x5, x6 by √2.
|
||||||
|
//
|
||||||
|
// Finally, the values are permuted. The permutation can be read as either:
|
||||||
|
// - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
|
||||||
|
// - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
|
||||||
|
// The code below uses the second form to make it easier to merge adjacent stores.
|
||||||
|
// (Note that unlike in recursive FFT implementations, the permutation here is
|
||||||
|
// not always mapping indexes to their bit reversals.)
|
||||||
|
//
|
||||||
|
// As written above, the rotation requires four multiplications, but it can be
|
||||||
|
// reduced to three by refactoring (see [dctBox] below), and the scaling in
|
||||||
|
// stage 3 can be merged into the rotation constants, so the overall cost
|
||||||
|
// of a 1D FDCT is 11 multiplies.
|
||||||
|
//
|
||||||
|
// The 1D inverse DCT (IDCT) is the 1D FDCT run backward
|
||||||
|
// with all the basic operations inverted.
|
||||||
|
|
||||||
|
// dctBox implements a 3-multiply, 3-add rotation+scaling.
|
||||||
|
// Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
|
||||||
|
// rotated and scaled coordinates.
|
||||||
|
// (It is called dctBox because the rotate+scale operation
|
||||||
|
// is drawn as a box in Figures 1 and 2 in the paper.)
|
||||||
|
func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
|
||||||
|
// y0 = x0*kcos + x1*ksin
|
||||||
|
// y1 = -x0*ksin + x1*kcos
|
||||||
|
ksum := kcos * (x0 + x1)
|
||||||
|
y0 = ksum + (ksin-kcos)*x1
|
||||||
|
y1 = ksum - (kcos+ksin)*x0
|
||||||
|
return y0, y1
|
||||||
|
}
|
||||||
|
|
||||||
|
// A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
|
||||||
|
// The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
|
||||||
|
// but it is convenient to use int32s for intermediate storage,
|
||||||
|
// so we define only a single block type of [8*8]int32.
|
||||||
|
//
|
||||||
|
// A 2D DCT is implemented as 1D DCTs over the rows and columns.
|
||||||
|
//
|
||||||
|
// dct_test.go defines a String method for nice printing in tests.
|
||||||
|
type block [blockSize]int32
|
||||||
|
|
||||||
|
const blockSize = 8 * 8
|
||||||
|
|
||||||
|
// Note on Numerical Precision
|
||||||
|
//
|
||||||
|
// The inputs to both the FDCT and IDCT are uint8 values stored in a block,
|
||||||
|
// and the outputs are int16s in the same block, but the overall operation
|
||||||
|
// uses int32 values as fixed-point intermediate values.
|
||||||
|
// In the code comments below, the notation “QN.M” refers to a
|
||||||
|
// signed value of 1+N+M significant bits, one of which is the sign bit,
|
||||||
|
// and M of which hold fractional (sub-integer) precision.
|
||||||
|
// For example, 255 as a Q8.0 value is stored as int32(255),
|
||||||
|
// while 255 as a Q8.1 value is stored as int32(510),
|
||||||
|
// and 255.5 as a Q8.1 value is int32(511).
|
||||||
|
// The notation UQN.M refers to an unsigned value of N+M significant bits.
|
||||||
|
// See https://en.wikipedia.org/wiki/Q_(number_format) for more.
|
||||||
|
//
|
||||||
|
// In general we only need to keep about 16 significant bits, but it is more
|
||||||
|
// efficient and somewhat more precise to let unnecessary fractional bits
|
||||||
|
// accumulate and shift them away in bulk rather than after every operation.
|
||||||
|
// As such, it is important to keep track of the number of fractional bits
|
||||||
|
// in each variable at different points in the code, to avoid mistakes like
|
||||||
|
// adding numbers with different fractional precisions, as well as to keep
|
||||||
|
// track of the total number of bits, to avoid overflow. A comment like:
|
||||||
|
//
|
||||||
|
// // x[123] now Q8.2.
|
||||||
|
//
|
||||||
|
// means that x1, x2, and x3 are all Q8.2 (11-bit) values.
|
||||||
|
// Keeping extra precision bits also reduces the size of the errors introduced
|
||||||
|
// by using right shift to approximate rounded division.
|
||||||
|
|
||||||
|
// Constants needed for the implementation.
|
||||||
|
// These are all 60-bit precision fixed-point constants.
|
||||||
|
// The function c(val, b) rounds the constant to b bits.
|
||||||
|
// c is simple enough that calls to it with constant args
|
||||||
|
// are inlined and constant-propagated down to an inline constant.
|
||||||
|
// Each constant is commented with its Ivy definition (see robpike.io/ivy),
|
||||||
|
// using this scaling helper function:
|
||||||
|
//
|
||||||
|
// op fix x = floor 0.5 + x * 2**60
|
||||||
|
const (
|
||||||
|
cos1 = 1130768441178740757 // fix cos 1*pi/16
|
||||||
|
sin1 = 224923827593068887 // fix sin 1*pi/16
|
||||||
|
cos3 = 958619196450722178 // fix cos 3*pi/16
|
||||||
|
sin3 = 640528868967736374 // fix sin 3*pi/16
|
||||||
|
sqrt2 = 1630477228166597777 // fix sqrt 2
|
||||||
|
sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)*cos 6*pi/16
|
||||||
|
sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
|
||||||
|
sqrt2inv = 815238614083298888 // fix 1/sqrt 2
|
||||||
|
sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)*cos 6*pi/16
|
||||||
|
sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)*sin 6*pi/16
|
||||||
|
)
|
||||||
|
|
||||||
|
func c(x uint64, bits int) int32 {
|
||||||
|
return int32((x + (1 << (59 - bits))) >> (60 - bits))
|
||||||
|
}
|
||||||
|
|
||||||
|
// fdct implements the forward DCT.
|
||||||
|
// Inputs are UQ8.0; outputs are Q13.0.
|
||||||
|
func fdct(b *block) {
|
||||||
|
fdctCols(b)
|
||||||
|
fdctRows(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fdctCols applies the 1D DCT to the columns of b.
|
||||||
|
// Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
|
||||||
|
// Outputs are Q10.18.
|
||||||
|
func fdctCols(b *block) {
|
||||||
|
for i := range 8 {
|
||||||
|
x0 := b[0*8+i]
|
||||||
|
x1 := b[1*8+i]
|
||||||
|
x2 := b[2*8+i]
|
||||||
|
x3 := b[3*8+i]
|
||||||
|
x4 := b[4*8+i]
|
||||||
|
x5 := b[5*8+i]
|
||||||
|
x6 := b[6*8+i]
|
||||||
|
x7 := b[7*8+i]
|
||||||
|
|
||||||
|
// x[01234567] are UQ8.0 in [0,255].
|
||||||
|
|
||||||
|
// Stage 1: four butterflies.
|
||||||
|
// In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
|
||||||
|
// A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.
|
||||||
|
|
||||||
|
x0, x7 = x0+x7, x0-x7
|
||||||
|
x1, x6 = x1+x6, x1-x6
|
||||||
|
x2, x5 = x2+x5, x2-x5
|
||||||
|
x3, x4 = x3+x4, x3-x4
|
||||||
|
// x[0123] now UQ9.0 in [0, 510].
|
||||||
|
// x[4567] now Q8.0 in [-255,255].
|
||||||
|
|
||||||
|
// Stage 2: two boxes and two butterflies.
|
||||||
|
// A box on QN.M inputs with B-bit constants
|
||||||
|
// produces Q(N+1).(M+B) outputs.
|
||||||
|
// (The +1 is from the addition.)
|
||||||
|
|
||||||
|
x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
|
||||||
|
x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
|
||||||
|
// x[47] now Q9.18 in [-354, 354].
|
||||||
|
// x[56] now Q9.18 in [-300, 300].
|
||||||
|
|
||||||
|
x0, x3 = x0+x3, x0-x3
|
||||||
|
x1, x2 = x1+x2, x1-x2
|
||||||
|
// x[01] now UQ10.0 in [0, 1020].
|
||||||
|
// x[23] now Q9.0 in [-510, 510].
|
||||||
|
|
||||||
|
// Stage 3: one box and three butterflies.
|
||||||
|
|
||||||
|
x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
|
||||||
|
// x[23] now Q10.18 in [-943, 943].
|
||||||
|
|
||||||
|
x0, x1 = x0+x1, x0-x1
|
||||||
|
// x0 now UQ11.0 in [0, 2040].
|
||||||
|
// x1 now Q10.0 in [-1020, 1020].
|
||||||
|
|
||||||
|
// Store x0, x1, x2, x3 to their permuted targets.
|
||||||
|
// The original +128 in every input value
|
||||||
|
// has cancelled out except in the “DC signal” x0.
|
||||||
|
// Subtracting 128*8 here is equivalent to subtracting 128
|
||||||
|
// from every input before we started, but cheaper.
|
||||||
|
// It also converts x0 from UQ11.18 to Q10.18.
|
||||||
|
b[0*8+i] = (x0 - 128*8) << 18
|
||||||
|
b[4*8+i] = x1 << 18
|
||||||
|
b[2*8+i] = x2
|
||||||
|
b[6*8+i] = x3
|
||||||
|
|
||||||
|
x4, x6 = x4+x6, x4-x6
|
||||||
|
x7, x5 = x7+x5, x7-x5
|
||||||
|
// x[4567] now Q10.18 in [-654, 654].
|
||||||
|
|
||||||
|
// Stage 4: two √2 scalings and one butterfly.
|
||||||
|
|
||||||
|
x5 = (x5 >> 12) * c(sqrt2, 12)
|
||||||
|
x6 = (x6 >> 12) * c(sqrt2, 12)
|
||||||
|
// x[56] still Q10.18 in [-925, 925] (= 654√2).
|
||||||
|
x7, x4 = x7+x4, x7-x4
|
||||||
|
// x[47] still Q10.18 in [-925, 925] (not Q11.18!).
|
||||||
|
// This is not obvious at all! See “Note on 925” below.
|
||||||
|
|
||||||
|
// Store x4 x5 x6 x7 to their permuted targets.
|
||||||
|
b[1*8+i] = x7
|
||||||
|
b[3*8+i] = x5
|
||||||
|
b[5*8+i] = x6
|
||||||
|
b[7*8+i] = x4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fdctRows applies the 1D DCT to the rows of b.
|
||||||
|
// Inputs are Q10.18; outputs are Q13.0.
|
||||||
|
func fdctRows(b *block) {
|
||||||
|
for i := range 8 {
|
||||||
|
x := b[8*i : 8*i+8 : 8*i+8]
|
||||||
|
x0 := x[0]
|
||||||
|
x1 := x[1]
|
||||||
|
x2 := x[2]
|
||||||
|
x3 := x[3]
|
||||||
|
x4 := x[4]
|
||||||
|
x5 := x[5]
|
||||||
|
x6 := x[6]
|
||||||
|
x7 := x[7]
|
||||||
|
|
||||||
|
// x[01234567] are Q10.18 [-1020, 1020].
|
||||||
|
|
||||||
|
// Stage 1: four butterflies.
|
||||||
|
|
||||||
|
x0, x7 = x0+x7, x0-x7
|
||||||
|
x1, x6 = x1+x6, x1-x6
|
||||||
|
x2, x5 = x2+x5, x2-x5
|
||||||
|
x3, x4 = x3+x4, x3-x4
|
||||||
|
// x[01234567] now Q11.18 in [-2040, 2040].
|
||||||
|
|
||||||
|
// Stage 2: two boxes and two butterflies.
|
||||||
|
|
||||||
|
x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
|
||||||
|
x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
|
||||||
|
// x[47] now Q12.18 in [-2830, 2830].
|
||||||
|
// x[56] now Q12.18 in [-2400, 2400].
|
||||||
|
x0, x3 = x0+x3, x0-x3
|
||||||
|
x1, x2 = x1+x2, x1-x2
|
||||||
|
// x[01234567] now Q12.18 in [-4080, 4080].
|
||||||
|
|
||||||
|
// Stage 3: one box and three butterflies.
|
||||||
|
|
||||||
|
x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
|
||||||
|
// x[23] now Q13.18 in [-7539, 7539].
|
||||||
|
x0, x1 = x0+x1, x0-x1
|
||||||
|
// x[01] now Q13.18 in [-8160, 8160].
|
||||||
|
x4, x6 = x4+x6, x4-x6
|
||||||
|
x7, x5 = x7+x5, x7-x5
|
||||||
|
// x[4567] now Q13.18 in [-5230, 5230].
|
||||||
|
|
||||||
|
// Stage 4: two √2 scalings and one butterfly.
|
||||||
|
|
||||||
|
x5 = (x5 >> 14) * c(sqrt2, 14)
|
||||||
|
x6 = (x6 >> 14) * c(sqrt2, 14)
|
||||||
|
// x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
|
||||||
|
x7, x4 = x7+x4, x7-x4
|
||||||
|
// x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
|
||||||
|
// See “Note on 925” below.
|
||||||
|
|
||||||
|
// Cut from Q13.18 to Q13.0.
|
||||||
|
x0 = (x0 + 1<<17) >> 18
|
||||||
|
x1 = (x1 + 1<<17) >> 18
|
||||||
|
x2 = (x2 + 1<<17) >> 18
|
||||||
|
x3 = (x3 + 1<<17) >> 18
|
||||||
|
x4 = (x4 + 1<<17) >> 18
|
||||||
|
x5 = (x5 + 1<<17) >> 18
|
||||||
|
x6 = (x6 + 1<<17) >> 18
|
||||||
|
x7 = (x7 + 1<<17) >> 18
|
||||||
|
|
||||||
|
// Note: Unlike in fdctCols, saved all stores for the end
|
||||||
|
// because they are adjacent memory locations and some systems
|
||||||
|
// can use multiword stores.
|
||||||
|
x[0] = x0
|
||||||
|
x[1] = x7
|
||||||
|
x[2] = x2
|
||||||
|
x[3] = x5
|
||||||
|
x[4] = x1
|
||||||
|
x[5] = x6
|
||||||
|
x[6] = x3
|
||||||
|
x[7] = x4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// “Note on 925”, deferred from above to avoid interrupting code.
|
||||||
|
//
|
||||||
|
// In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
|
||||||
|
// Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
|
||||||
|
//
|
||||||
|
// Stage 2:
|
||||||
|
// x4 = b4*cos3 + b7*sin3
|
||||||
|
// x7 = -b4*sin3 + b7*cos3
|
||||||
|
// x5 = b5*cos1 + b6*sin1
|
||||||
|
// x6 = -b5*sin1 + b6*cos1
|
||||||
|
//
|
||||||
|
// Stage 3:
|
||||||
|
//
|
||||||
|
// x4 = x4+x6 = b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
|
||||||
|
// x6 = x4-x6 = b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
|
||||||
|
// x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
|
||||||
|
// x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
|
||||||
|
//
|
||||||
|
// Stage 4:
|
||||||
|
//
|
||||||
|
// x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
|
||||||
|
// = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
|
||||||
|
// < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
|
||||||
|
//
|
||||||
|
// x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
|
||||||
|
// = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
|
||||||
|
// < same 925.
|
||||||
|
//
|
||||||
|
// The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
|
||||||
|
// the same kinds of numbers for all four, just with different paths to them.
|
||||||
|
//
|
||||||
|
// In fdctRows, the same analysis applies, but the initial values are
|
||||||
|
// in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.
|
||||||
|
|
||||||
|
// idct implements the inverse DCT.
|
||||||
|
// Inputs are UQ8.0; outputs are Q10.3.
|
||||||
|
func idct(b *block) {
|
||||||
|
// A 2D IDCT is a 1D IDCT on rows followed by columns.
|
||||||
|
idctRows(b)
|
||||||
|
idctCols(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// idctRows applies the 1D IDCT to the rows of b.
|
||||||
|
// Inputs are UQ8.0; outputs are Q9.20.
|
||||||
|
func idctRows(b *block) {
|
||||||
|
for i := range 8 {
|
||||||
|
x := b[8*i : 8*i+8 : 8*i+8]
|
||||||
|
x0 := x[0]
|
||||||
|
x7 := x[1]
|
||||||
|
x2 := x[2]
|
||||||
|
x5 := x[3]
|
||||||
|
x1 := x[4]
|
||||||
|
x6 := x[5]
|
||||||
|
x3 := x[6]
|
||||||
|
x4 := x[7]
|
||||||
|
|
||||||
|
// Run FDCT backward.
|
||||||
|
// Independent operations have been reordered somewhat
|
||||||
|
// to make precision tracking easier.
|
||||||
|
//
|
||||||
|
// Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
|
||||||
|
// and carries with it an implicit divide by two: the extra bit
|
||||||
|
// is added to the precision, not the value size.
|
||||||
|
|
||||||
|
// x[01234567] are UQ8.0 in [0, 255].
|
||||||
|
|
||||||
|
// Stages 4, 3, 2: x0, x1, x2, x3.
|
||||||
|
|
||||||
|
x0 <<= 17
|
||||||
|
x1 <<= 17
|
||||||
|
// x0, x1 now UQ8.17.
|
||||||
|
x0, x1 = x0+x1, x0-x1
|
||||||
|
// x0 now UQ8.18 in [0, 255].
|
||||||
|
// x1 now Q7.18 in [-127½, 127½].
|
||||||
|
|
||||||
|
// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
|
||||||
|
x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
|
||||||
|
// x[23] now Q8.18 in [-236, 236].
|
||||||
|
x1, x2 = x1+x2, x1-x2
|
||||||
|
x0, x3 = x0+x3, x0-x3
|
||||||
|
// x[0123] now Q8.19 in [-246, 246].
|
||||||
|
|
||||||
|
// Stages 4, 3, 2: x4, x5, x6, x7.
|
||||||
|
|
||||||
|
x4 <<= 7
|
||||||
|
x7 <<= 7
|
||||||
|
// x[47] now UQ8.7
|
||||||
|
x7, x4 = x7+x4, x7-x4
|
||||||
|
// x7 now UQ8.8 in [0, 255].
|
||||||
|
// x4 now Q7.8 in [-127½, 127½].
|
||||||
|
|
||||||
|
x6 = x6 * c(sqrt2inv, 8)
|
||||||
|
x5 = x5 * c(sqrt2inv, 8)
|
||||||
|
// x[56] now UQ8.8 in [0, 181].
|
||||||
|
// Note that 1/√2 has five 0s in its binary representation after
|
||||||
|
// the 8th bit, so this multipliy is actually producing 12 bits of precision.
|
||||||
|
|
||||||
|
x7, x5 = x7+x5, x7-x5
|
||||||
|
x4, x6 = x4+x6, x4-x6
|
||||||
|
// x[4567] now Q8.9 in [-218, 218].
|
||||||
|
|
||||||
|
x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
|
||||||
|
x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
|
||||||
|
// x[4567] now Q9.19 in [-303, 303].
|
||||||
|
|
||||||
|
// Stage 1.
|
||||||
|
|
||||||
|
x0, x7 = x0+x7, x0-x7
|
||||||
|
x1, x6 = x1+x6, x1-x6
|
||||||
|
x2, x5 = x2+x5, x2-x5
|
||||||
|
x3, x4 = x3+x4, x3-x4
|
||||||
|
// x[01234567] now Q9.20 in [-275, 275].
|
||||||
|
|
||||||
|
// Note: we don't need all 20 bits of “precision”,
|
||||||
|
// but it is faster to let idctCols shift it away as part
|
||||||
|
// of other operations rather than downshift here.
|
||||||
|
|
||||||
|
x[0] = x0
|
||||||
|
x[1] = x1
|
||||||
|
x[2] = x2
|
||||||
|
x[3] = x3
|
||||||
|
x[4] = x4
|
||||||
|
x[5] = x5
|
||||||
|
x[6] = x6
|
||||||
|
x[7] = x7
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// idctCols applies the 1D IDCT to the columns of b.
|
||||||
|
// Inputs are Q9.20.
|
||||||
|
// Outputs are Q10.3. That is, the result is the IDCT*8.
|
||||||
|
func idctCols(b *block) {
|
||||||
|
for i := range 8 {
|
||||||
|
x0 := b[0*8+i]
|
||||||
|
x7 := b[1*8+i]
|
||||||
|
x2 := b[2*8+i]
|
||||||
|
x5 := b[3*8+i]
|
||||||
|
x1 := b[4*8+i]
|
||||||
|
x6 := b[5*8+i]
|
||||||
|
x3 := b[6*8+i]
|
||||||
|
x4 := b[7*8+i]
|
||||||
|
|
||||||
|
// x[012345678] are Q9.20.
|
||||||
|
|
||||||
|
// Start by adding 0.5 to x0 (the incoming DC signal).
|
||||||
|
// The butterflies will add it to all the other values,
|
||||||
|
// and then the final shifts will round properly.
|
||||||
|
x0 += 1 << 19
|
||||||
|
|
||||||
|
// Stages 4, 3, 2: x0, x1, x2, x3.
|
||||||
|
|
||||||
|
x0, x1 = (x0+x1)>>2, (x0-x1)>>2
|
||||||
|
// x[01] now Q9.19.
|
||||||
|
// Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
|
||||||
|
x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
|
||||||
|
// x[0123] now Q9.19.
|
||||||
|
|
||||||
|
x1, x2 = x1+x2, x1-x2
|
||||||
|
x0, x3 = x0+x3, x0-x3
|
||||||
|
// x[0123] now Q9.20.
|
||||||
|
|
||||||
|
// Stages 4, 3, 2: x4, x5, x6, x7.
|
||||||
|
|
||||||
|
x7, x4 = x7+x4, x7-x4
|
||||||
|
// x[47] now Q9.21.
|
||||||
|
|
||||||
|
x5 = (x5 >> 13) * c(sqrt2inv, 14)
|
||||||
|
x6 = (x6 >> 13) * c(sqrt2inv, 14)
|
||||||
|
// x[56] now Q9.21.
|
||||||
|
|
||||||
|
x7, x5 = x7+x5, x7-x5
|
||||||
|
x4, x6 = x4+x6, x4-x6
|
||||||
|
// x[4567] now Q9.22.
|
||||||
|
|
||||||
|
x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
|
||||||
|
x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
|
||||||
|
// x[4567] now Q10.20.
|
||||||
|
|
||||||
|
x0, x7 = x0+x7, x0-x7
|
||||||
|
x1, x6 = x1+x6, x1-x6
|
||||||
|
x2, x5 = x2+x5, x2-x5
|
||||||
|
x3, x4 = x3+x4, x3-x4
|
||||||
|
// x[01234567] now Q10.21.
|
||||||
|
|
||||||
|
x0 >>= 18
|
||||||
|
x1 >>= 18
|
||||||
|
x2 >>= 18
|
||||||
|
x3 >>= 18
|
||||||
|
x4 >>= 18
|
||||||
|
x5 >>= 18
|
||||||
|
x6 >>= 18
|
||||||
|
x7 >>= 18
|
||||||
|
// x[01234567] now Q10.3.
|
||||||
|
|
||||||
|
b[0*8+i] = x0
|
||||||
|
b[1*8+i] = x1
|
||||||
|
b[2*8+i] = x2
|
||||||
|
b[3*8+i] = x3
|
||||||
|
b[4*8+i] = x4
|
||||||
|
b[5*8+i] = x5
|
||||||
|
b[6*8+i] = x6
|
||||||
|
b[7*8+i] = x7
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -92,7 +92,7 @@ func TestDCT(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that the optimized and slow FDCT implementations agree.
|
// Check that the optimized and slow FDCT implementations agree.
|
||||||
testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 16)
|
testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 8)
|
||||||
testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8)
|
testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,192 +0,0 @@
|
||||||
// Copyright 2011 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
package jpeg
|
|
||||||
|
|
||||||
// This file implements a Forward Discrete Cosine Transformation.
|
|
||||||
|
|
||||||
/*
|
|
||||||
It is based on the code in jfdctint.c from the Independent JPEG Group,
|
|
||||||
found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz.
|
|
||||||
|
|
||||||
The "LEGAL ISSUES" section of the README in that archive says:
|
|
||||||
|
|
||||||
In plain English:
|
|
||||||
|
|
||||||
1. We don't promise that this software works. (But if you find any bugs,
|
|
||||||
please let us know!)
|
|
||||||
2. You can use this software for whatever you want. You don't have to pay us.
|
|
||||||
3. You may not pretend that you wrote this software. If you use it in a
|
|
||||||
program, you must acknowledge somewhere in your documentation that
|
|
||||||
you've used the IJG code.
|
|
||||||
|
|
||||||
In legalese:
|
|
||||||
|
|
||||||
The authors make NO WARRANTY or representation, either express or implied,
|
|
||||||
with respect to this software, its quality, accuracy, merchantability, or
|
|
||||||
fitness for a particular purpose. This software is provided "AS IS", and you,
|
|
||||||
its user, assume the entire risk as to its quality and accuracy.
|
|
||||||
|
|
||||||
This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding.
|
|
||||||
All Rights Reserved except as specified below.
|
|
||||||
|
|
||||||
Permission is hereby granted to use, copy, modify, and distribute this
|
|
||||||
software (or portions thereof) for any purpose, without fee, subject to these
|
|
||||||
conditions:
|
|
||||||
(1) If any part of the source code for this software is distributed, then this
|
|
||||||
README file must be included, with this copyright and no-warranty notice
|
|
||||||
unaltered; and any additions, deletions, or changes to the original files
|
|
||||||
must be clearly indicated in accompanying documentation.
|
|
||||||
(2) If only executable code is distributed, then the accompanying
|
|
||||||
documentation must state that "this software is based in part on the work of
|
|
||||||
the Independent JPEG Group".
|
|
||||||
(3) Permission for use of this software is granted only if the user accepts
|
|
||||||
full responsibility for any undesirable consequences; the authors accept
|
|
||||||
NO LIABILITY for damages of any kind.
|
|
||||||
|
|
||||||
These conditions apply to any software derived from or based on the IJG code,
|
|
||||||
not just to the unmodified library. If you use our work, you ought to
|
|
||||||
acknowledge us.
|
|
||||||
|
|
||||||
Permission is NOT granted for the use of any IJG author's name or company name
|
|
||||||
in advertising or publicity relating to this software or products derived from
|
|
||||||
it. This software may be referred to only as "the Independent JPEG Group's
|
|
||||||
software".
|
|
||||||
|
|
||||||
We specifically permit and encourage the use of this software as the basis of
|
|
||||||
commercial products, provided that all warranty or liability claims are
|
|
||||||
assumed by the product vendor.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Trigonometric constants in 13-bit fixed point format.
|
|
||||||
const (
|
|
||||||
fix_0_298631336 = 2446
|
|
||||||
fix_0_390180644 = 3196
|
|
||||||
fix_0_541196100 = 4433
|
|
||||||
fix_0_765366865 = 6270
|
|
||||||
fix_0_899976223 = 7373
|
|
||||||
fix_1_175875602 = 9633
|
|
||||||
fix_1_501321110 = 12299
|
|
||||||
fix_1_847759065 = 15137
|
|
||||||
fix_1_961570560 = 16069
|
|
||||||
fix_2_053119869 = 16819
|
|
||||||
fix_2_562915447 = 20995
|
|
||||||
fix_3_072711026 = 25172
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
constBits = 13
|
|
||||||
pass1Bits = 2
|
|
||||||
centerJSample = 128
|
|
||||||
)
|
|
||||||
|
|
||||||
// fdct performs a forward DCT on an 8x8 block of coefficients, including a
|
|
||||||
// level shift.
|
|
||||||
func fdct(b *block) {
|
|
||||||
// Pass 1: process rows.
|
|
||||||
for y := 0; y < 8; y++ {
|
|
||||||
y8 := y * 8
|
|
||||||
s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
|
|
||||||
x0 := s[0]
|
|
||||||
x1 := s[1]
|
|
||||||
x2 := s[2]
|
|
||||||
x3 := s[3]
|
|
||||||
x4 := s[4]
|
|
||||||
x5 := s[5]
|
|
||||||
x6 := s[6]
|
|
||||||
x7 := s[7]
|
|
||||||
|
|
||||||
tmp0 := x0 + x7
|
|
||||||
tmp1 := x1 + x6
|
|
||||||
tmp2 := x2 + x5
|
|
||||||
tmp3 := x3 + x4
|
|
||||||
|
|
||||||
tmp10 := tmp0 + tmp3
|
|
||||||
tmp12 := tmp0 - tmp3
|
|
||||||
tmp11 := tmp1 + tmp2
|
|
||||||
tmp13 := tmp1 - tmp2
|
|
||||||
|
|
||||||
tmp0 = x0 - x7
|
|
||||||
tmp1 = x1 - x6
|
|
||||||
tmp2 = x2 - x5
|
|
||||||
tmp3 = x3 - x4
|
|
||||||
|
|
||||||
s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
|
|
||||||
s[4] = (tmp10 - tmp11) << pass1Bits
|
|
||||||
z1 := (tmp12 + tmp13) * fix_0_541196100
|
|
||||||
z1 += 1 << (constBits - pass1Bits - 1)
|
|
||||||
s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
|
|
||||||
s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
|
|
||||||
|
|
||||||
tmp10 = tmp0 + tmp3
|
|
||||||
tmp11 = tmp1 + tmp2
|
|
||||||
tmp12 = tmp0 + tmp2
|
|
||||||
tmp13 = tmp1 + tmp3
|
|
||||||
z1 = (tmp12 + tmp13) * fix_1_175875602
|
|
||||||
z1 += 1 << (constBits - pass1Bits - 1)
|
|
||||||
tmp0 *= fix_1_501321110
|
|
||||||
tmp1 *= fix_3_072711026
|
|
||||||
tmp2 *= fix_2_053119869
|
|
||||||
tmp3 *= fix_0_298631336
|
|
||||||
tmp10 *= -fix_0_899976223
|
|
||||||
tmp11 *= -fix_2_562915447
|
|
||||||
tmp12 *= -fix_0_390180644
|
|
||||||
tmp13 *= -fix_1_961570560
|
|
||||||
|
|
||||||
tmp12 += z1
|
|
||||||
tmp13 += z1
|
|
||||||
s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
|
|
||||||
s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
|
|
||||||
s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
|
|
||||||
s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
|
|
||||||
}
|
|
||||||
// Pass 2: process columns.
|
|
||||||
// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
|
|
||||||
for x := 0; x < 8; x++ {
|
|
||||||
tmp0 := b[0*8+x] + b[7*8+x]
|
|
||||||
tmp1 := b[1*8+x] + b[6*8+x]
|
|
||||||
tmp2 := b[2*8+x] + b[5*8+x]
|
|
||||||
tmp3 := b[3*8+x] + b[4*8+x]
|
|
||||||
|
|
||||||
tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1)
|
|
||||||
tmp12 := tmp0 - tmp3
|
|
||||||
tmp11 := tmp1 + tmp2
|
|
||||||
tmp13 := tmp1 - tmp2
|
|
||||||
|
|
||||||
tmp0 = b[0*8+x] - b[7*8+x]
|
|
||||||
tmp1 = b[1*8+x] - b[6*8+x]
|
|
||||||
tmp2 = b[2*8+x] - b[5*8+x]
|
|
||||||
tmp3 = b[3*8+x] - b[4*8+x]
|
|
||||||
|
|
||||||
b[0*8+x] = (tmp10 + tmp11) >> pass1Bits
|
|
||||||
b[4*8+x] = (tmp10 - tmp11) >> pass1Bits
|
|
||||||
|
|
||||||
z1 := (tmp12 + tmp13) * fix_0_541196100
|
|
||||||
z1 += 1 << (constBits + pass1Bits - 1)
|
|
||||||
b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits)
|
|
||||||
b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits)
|
|
||||||
|
|
||||||
tmp10 = tmp0 + tmp3
|
|
||||||
tmp11 = tmp1 + tmp2
|
|
||||||
tmp12 = tmp0 + tmp2
|
|
||||||
tmp13 = tmp1 + tmp3
|
|
||||||
z1 = (tmp12 + tmp13) * fix_1_175875602
|
|
||||||
z1 += 1 << (constBits + pass1Bits - 1)
|
|
||||||
tmp0 *= fix_1_501321110
|
|
||||||
tmp1 *= fix_3_072711026
|
|
||||||
tmp2 *= fix_2_053119869
|
|
||||||
tmp3 *= fix_0_298631336
|
|
||||||
tmp10 *= -fix_0_899976223
|
|
||||||
tmp11 *= -fix_2_562915447
|
|
||||||
tmp12 *= -fix_0_390180644
|
|
||||||
tmp13 *= -fix_1_961570560
|
|
||||||
|
|
||||||
tmp12 += z1
|
|
||||||
tmp13 += z1
|
|
||||||
b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits)
|
|
||||||
b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits)
|
|
||||||
b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits)
|
|
||||||
b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,194 +0,0 @@
|
||||||
// Copyright 2009 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
package jpeg
|
|
||||||
|
|
||||||
// This is a Go translation of idct.c from
|
|
||||||
//
|
|
||||||
// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
|
|
||||||
//
|
|
||||||
// which carries the following notice:
|
|
||||||
|
|
||||||
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Disclaimer of Warranty
|
|
||||||
*
|
|
||||||
* These software programs are available to the user without any license fee or
|
|
||||||
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
|
|
||||||
* any and all warranties, whether express, implied, or statuary, including any
|
|
||||||
* implied warranties or merchantability or of fitness for a particular
|
|
||||||
* purpose. In no event shall the copyright-holder be liable for any
|
|
||||||
* incidental, punitive, or consequential damages of any kind whatsoever
|
|
||||||
* arising from the use of these programs.
|
|
||||||
*
|
|
||||||
* This disclaimer of warranty extends to the user of these programs and user's
|
|
||||||
* customers, employees, agents, transferees, successors, and assigns.
|
|
||||||
*
|
|
||||||
* The MPEG Software Simulation Group does not represent or warrant that the
|
|
||||||
* programs furnished hereunder are free of infringement of any third-party
|
|
||||||
* patents.
|
|
||||||
*
|
|
||||||
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
|
|
||||||
* are subject to royalty fees to patent holders. Many of these patents are
|
|
||||||
* general enough such that they are unavoidable regardless of implementation
|
|
||||||
* design.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
const blockSize = 64 // A DCT block is 8x8.
|
|
||||||
|
|
||||||
type block [blockSize]int32
|
|
||||||
|
|
||||||
const (
|
|
||||||
w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
|
|
||||||
w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
|
|
||||||
w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16)
|
|
||||||
w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16)
|
|
||||||
w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16)
|
|
||||||
w7 = 565 // 2048*sqrt(2)*cos(7*pi/16)
|
|
||||||
|
|
||||||
w1pw7 = w1 + w7
|
|
||||||
w1mw7 = w1 - w7
|
|
||||||
w2pw6 = w2 + w6
|
|
||||||
w2mw6 = w2 - w6
|
|
||||||
w3pw5 = w3 + w5
|
|
||||||
w3mw5 = w3 - w5
|
|
||||||
|
|
||||||
r2 = 181 // 256/sqrt(2)
|
|
||||||
)
|
|
||||||
|
|
||||||
// idct performs a 2-D Inverse Discrete Cosine Transformation.
|
|
||||||
//
|
|
||||||
// The input coefficients should already have been multiplied by the
|
|
||||||
// appropriate quantization table. We use fixed-point computation, with the
|
|
||||||
// number of bits for the fractional component varying over the intermediate
|
|
||||||
// stages.
|
|
||||||
//
|
|
||||||
// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
|
|
||||||
// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
|
|
||||||
// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
|
|
||||||
func idct(src *block) {
|
|
||||||
// Horizontal 1-D IDCT.
|
|
||||||
for y := 0; y < 8; y++ {
|
|
||||||
y8 := y * 8
|
|
||||||
s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
|
|
||||||
// If all the AC components are zero, then the IDCT is trivial.
|
|
||||||
if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
|
|
||||||
s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
|
|
||||||
dc := s[0] << 3
|
|
||||||
s[0] = dc
|
|
||||||
s[1] = dc
|
|
||||||
s[2] = dc
|
|
||||||
s[3] = dc
|
|
||||||
s[4] = dc
|
|
||||||
s[5] = dc
|
|
||||||
s[6] = dc
|
|
||||||
s[7] = dc
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prescale.
|
|
||||||
x0 := (s[0] << 11) + 128
|
|
||||||
x1 := s[4] << 11
|
|
||||||
x2 := s[6]
|
|
||||||
x3 := s[2]
|
|
||||||
x4 := s[1]
|
|
||||||
x5 := s[7]
|
|
||||||
x6 := s[5]
|
|
||||||
x7 := s[3]
|
|
||||||
|
|
||||||
// Stage 1.
|
|
||||||
x8 := w7 * (x4 + x5)
|
|
||||||
x4 = x8 + w1mw7*x4
|
|
||||||
x5 = x8 - w1pw7*x5
|
|
||||||
x8 = w3 * (x6 + x7)
|
|
||||||
x6 = x8 - w3mw5*x6
|
|
||||||
x7 = x8 - w3pw5*x7
|
|
||||||
|
|
||||||
// Stage 2.
|
|
||||||
x8 = x0 + x1
|
|
||||||
x0 -= x1
|
|
||||||
x1 = w6 * (x3 + x2)
|
|
||||||
x2 = x1 - w2pw6*x2
|
|
||||||
x3 = x1 + w2mw6*x3
|
|
||||||
x1 = x4 + x6
|
|
||||||
x4 -= x6
|
|
||||||
x6 = x5 + x7
|
|
||||||
x5 -= x7
|
|
||||||
|
|
||||||
// Stage 3.
|
|
||||||
x7 = x8 + x3
|
|
||||||
x8 -= x3
|
|
||||||
x3 = x0 + x2
|
|
||||||
x0 -= x2
|
|
||||||
x2 = (r2*(x4+x5) + 128) >> 8
|
|
||||||
x4 = (r2*(x4-x5) + 128) >> 8
|
|
||||||
|
|
||||||
// Stage 4.
|
|
||||||
s[0] = (x7 + x1) >> 8
|
|
||||||
s[1] = (x3 + x2) >> 8
|
|
||||||
s[2] = (x0 + x4) >> 8
|
|
||||||
s[3] = (x8 + x6) >> 8
|
|
||||||
s[4] = (x8 - x6) >> 8
|
|
||||||
s[5] = (x0 - x4) >> 8
|
|
||||||
s[6] = (x3 - x2) >> 8
|
|
||||||
s[7] = (x7 - x1) >> 8
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vertical 1-D IDCT.
|
|
||||||
for x := 0; x < 8; x++ {
|
|
||||||
// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
|
|
||||||
// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
|
|
||||||
// we do not bother to check for the all-zero case.
|
|
||||||
s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
|
|
||||||
|
|
||||||
// Prescale.
|
|
||||||
y0 := (s[8*0] << 8) + 8192
|
|
||||||
y1 := s[8*4] << 8
|
|
||||||
y2 := s[8*6]
|
|
||||||
y3 := s[8*2]
|
|
||||||
y4 := s[8*1]
|
|
||||||
y5 := s[8*7]
|
|
||||||
y6 := s[8*5]
|
|
||||||
y7 := s[8*3]
|
|
||||||
|
|
||||||
// Stage 1.
|
|
||||||
y8 := w7*(y4+y5) + 4
|
|
||||||
y4 = (y8 + w1mw7*y4) >> 3
|
|
||||||
y5 = (y8 - w1pw7*y5) >> 3
|
|
||||||
y8 = w3*(y6+y7) + 4
|
|
||||||
y6 = (y8 - w3mw5*y6) >> 3
|
|
||||||
y7 = (y8 - w3pw5*y7) >> 3
|
|
||||||
|
|
||||||
// Stage 2.
|
|
||||||
y8 = y0 + y1
|
|
||||||
y0 -= y1
|
|
||||||
y1 = w6*(y3+y2) + 4
|
|
||||||
y2 = (y1 - w2pw6*y2) >> 3
|
|
||||||
y3 = (y1 + w2mw6*y3) >> 3
|
|
||||||
y1 = y4 + y6
|
|
||||||
y4 -= y6
|
|
||||||
y6 = y5 + y7
|
|
||||||
y5 -= y7
|
|
||||||
|
|
||||||
// Stage 3.
|
|
||||||
y7 = y8 + y3
|
|
||||||
y8 -= y3
|
|
||||||
y3 = y0 + y2
|
|
||||||
y0 -= y2
|
|
||||||
y2 = (r2*(y4+y5) + 128) >> 8
|
|
||||||
y4 = (r2*(y4-y5) + 128) >> 8
|
|
||||||
|
|
||||||
// Stage 4.
|
|
||||||
s[8*0] = (y7 + y1) >> 14
|
|
||||||
s[8*1] = (y3 + y2) >> 14
|
|
||||||
s[8*2] = (y0 + y4) >> 14
|
|
||||||
s[8*3] = (y8 + y6) >> 14
|
|
||||||
s[8*4] = (y8 - y6) >> 14
|
|
||||||
s[8*5] = (y0 - y4) >> 14
|
|
||||||
s[8*6] = (y3 - y2) >> 14
|
|
||||||
s[8*7] = (y7 - y1) >> 14
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue