| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * FFT/MDCT transform with Extended 3DNow! optimizations | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |  * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  |  * | 
					
						
							| 
									
										
										
										
											2006-10-07 15:30:46 +00:00
										 |  |  |  * This file is part of FFmpeg. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * FFmpeg is free software; you can redistribute it and/or | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  |  * modify it under the terms of the GNU Lesser General Public | 
					
						
							|  |  |  |  * License as published by the Free Software Foundation; either | 
					
						
							| 
									
										
										
										
											2006-10-07 15:30:46 +00:00
										 |  |  |  * version 2.1 of the License, or (at your option) any later version. | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  |  * | 
					
						
							| 
									
										
										
										
											2006-10-07 15:30:46 +00:00
										 |  |  |  * FFmpeg is distributed in the hope that it will be useful, | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
					
						
							|  |  |  |  * Lesser General Public License for more details. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * You should have received a copy of the GNU Lesser General Public | 
					
						
							| 
									
										
										
										
											2006-10-07 15:30:46 +00:00
										 |  |  |  * License along with FFmpeg; if not, write to the Free Software | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2008-05-09 11:56:36 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include "libavutil/x86_cpu.h"
 | 
					
						
							|  |  |  | #include "libavcodec/dsputil.h"
 | 
					
						
							| 
									
										
										
										
											2009-09-15 21:14:14 +00:00
										 |  |  | #include "fft.h"
 | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-19 21:32:09 -04:00
										 |  |  | DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 }; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | #ifdef EMULATE_3DNOWEXT
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | #define PSWAPD(s,d)\
 | 
					
						
							|  |  |  |     "movq "#s","#d"\n"\ | 
					
						
							|  |  |  |     "psrlq $32,"#d"\n"\ | 
					
						
							|  |  |  |     "punpckldq "#s","#d"\n" | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | #define ff_fft_calc_3dn2 ff_fft_calc_3dn
 | 
					
						
							|  |  |  | #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
 | 
					
						
							|  |  |  | #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
 | 
					
						
							|  |  |  | #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
 | 
					
						
							|  |  |  | #define ff_imdct_half_3dn2 ff_imdct_half_3dn
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | #else
 | 
					
						
							|  |  |  | #define PSWAPD(s,d) "pswapd "#s","#d"\n"
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); | 
					
						
							| 
									
										
										
										
											2008-08-14 04:41:02 +00:00
										 |  |  | void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  |     int n = 1<<s->nbits; | 
					
						
							|  |  |  |     int i; | 
					
						
							|  |  |  |     ff_fft_dispatch_interleave_3dn2(z, s->nbits); | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile("femms"); | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  |     if(n <= 8) | 
					
						
							|  |  |  |         for(i=0; i<n; i+=2) | 
					
						
							|  |  |  |             FFSWAP(FFTSample, z[i].im, z[i+1].re); | 
					
						
							| 
									
										
										
										
											2006-03-08 04:13:55 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-09-20 17:30:20 +00:00
										 |  |  | void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     x86_reg j, k; | 
					
						
							| 
									
										
										
										
											2010-08-23 15:51:09 +00:00
										 |  |  |     long n = s->mdct_size; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     long n2 = n >> 1; | 
					
						
							|  |  |  |     long n4 = n >> 2; | 
					
						
							|  |  |  |     long n8 = n >> 3; | 
					
						
							| 
									
										
										
										
											2009-09-20 17:30:20 +00:00
										 |  |  |     const uint16_t *revtab = s->revtab; | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  |     const FFTSample *tcos = s->tcos; | 
					
						
							|  |  |  |     const FFTSample *tsin = s->tsin; | 
					
						
							|  |  |  |     const FFTSample *in1, *in2; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     FFTComplex *z = (FFTComplex *)output; | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /* pre rotation */ | 
					
						
							|  |  |  |     in1 = input; | 
					
						
							|  |  |  |     in2 = input + n2 - 1; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | #ifdef EMULATE_3DNOWEXT
 | 
					
						
							| 
									
										
										
										
											2011-09-19 21:32:09 -04:00
										 |  |  |     __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31)); | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  |     for(k = 0; k < n4; k++) { | 
					
						
							| 
									
										
										
										
											2006-08-09 06:33:49 +00:00
										 |  |  |         // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
 | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |         __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |             "movd         %0, %%mm0 \n" | 
					
						
							|  |  |  |             "movd         %2, %%mm1 \n" | 
					
						
							|  |  |  |             "punpckldq    %1, %%mm0 \n" | 
					
						
							|  |  |  |             "punpckldq    %3, %%mm1 \n" | 
					
						
							|  |  |  |             "movq      %%mm0, %%mm2 \n" | 
					
						
							|  |  |  |             PSWAPD(    %%mm1, %%mm3 ) | 
					
						
							|  |  |  |             "pfmul     %%mm1, %%mm0 \n" | 
					
						
							|  |  |  |             "pfmul     %%mm3, %%mm2 \n" | 
					
						
							|  |  |  | #ifdef EMULATE_3DNOWEXT
 | 
					
						
							|  |  |  |             "movq      %%mm0, %%mm1 \n" | 
					
						
							|  |  |  |             "punpckhdq %%mm2, %%mm0 \n" | 
					
						
							|  |  |  |             "punpckldq %%mm2, %%mm1 \n" | 
					
						
							|  |  |  |             "pxor      %%mm7, %%mm0 \n" | 
					
						
							|  |  |  |             "pfadd     %%mm1, %%mm0 \n" | 
					
						
							|  |  |  | #else
 | 
					
						
							|  |  |  |             "pfpnacc   %%mm2, %%mm0 \n" | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2006-08-09 06:33:49 +00:00
										 |  |  |             ::"m"(in2[-2*k]), "m"(in1[2*k]), | 
					
						
							|  |  |  |               "m"(tcos[k]), "m"(tsin[k]) | 
					
						
							|  |  |  |         ); | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |         __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  |             "movq    %%mm0, %0    \n\t" | 
					
						
							|  |  |  |             :"=m"(z[revtab[k]]) | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-09-20 17:30:20 +00:00
										 |  |  |     ff_fft_dispatch_3dn2(z, s->nbits); | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | #define CMUL(j,mm0,mm1)\
 | 
					
						
							|  |  |  |         "movq  (%2,"#j",2), %%mm6 \n"\ | 
					
						
							|  |  |  |         "movq 8(%2,"#j",2), "#mm0"\n"\ | 
					
						
							|  |  |  |         "movq        %%mm6, "#mm1"\n"\ | 
					
						
							|  |  |  |         "movq        "#mm0",%%mm7 \n"\ | 
					
						
							|  |  |  |         "pfmul   (%3,"#j"), %%mm6 \n"\ | 
					
						
							|  |  |  |         "pfmul   (%4,"#j"), "#mm0"\n"\ | 
					
						
							|  |  |  |         "pfmul   (%4,"#j"), "#mm1"\n"\ | 
					
						
							|  |  |  |         "pfmul   (%3,"#j"), %%mm7 \n"\ | 
					
						
							|  |  |  |         "pfsub       %%mm6, "#mm0"\n"\ | 
					
						
							|  |  |  |         "pfadd       %%mm7, "#mm1"\n" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* post rotation */ | 
					
						
							|  |  |  |     j = -n2; | 
					
						
							|  |  |  |     k = n2-8; | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |         "1: \n" | 
					
						
							|  |  |  |         CMUL(%0, %%mm0, %%mm1) | 
					
						
							|  |  |  |         CMUL(%1, %%mm2, %%mm3) | 
					
						
							|  |  |  |         "movd   %%mm0,  (%2,%0,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm1,12(%2,%1,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm2,  (%2,%1,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm3,12(%2,%0,2) \n" | 
					
						
							|  |  |  |         "psrlq  $32,   %%mm0 \n" | 
					
						
							|  |  |  |         "psrlq  $32,   %%mm1 \n" | 
					
						
							|  |  |  |         "psrlq  $32,   %%mm2 \n" | 
					
						
							|  |  |  |         "psrlq  $32,   %%mm3 \n" | 
					
						
							|  |  |  |         "movd   %%mm0, 8(%2,%0,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm1, 4(%2,%1,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm2, 8(%2,%1,2) \n" | 
					
						
							|  |  |  |         "movd   %%mm3, 4(%2,%0,2) \n" | 
					
						
							|  |  |  |         "sub $8, %1 \n" | 
					
						
							|  |  |  |         "add $8, %0 \n" | 
					
						
							|  |  |  |         "jl 1b \n" | 
					
						
							|  |  |  |         :"+r"(j), "+r"(k) | 
					
						
							|  |  |  |         :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) | 
					
						
							| 
									
										
										
										
											2006-09-21 17:42:23 +00:00
										 |  |  |         :"memory" | 
					
						
							|  |  |  |     ); | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile("femms"); | 
					
						
							| 
									
										
										
										
											2006-08-08 04:01:04 +00:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2006-08-18 23:53:49 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-09-20 17:30:20 +00:00
										 |  |  | void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     x86_reg j, k; | 
					
						
							| 
									
										
										
										
											2010-08-23 15:51:09 +00:00
										 |  |  |     long n = s->mdct_size; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     long n4 = n >> 2; | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     ff_imdct_half_3dn2(s, output+n4, input); | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     j = -n; | 
					
						
							|  |  |  |     k = n-8; | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |         "movq %4, %%mm7 \n" | 
					
						
							|  |  |  |         "1: \n" | 
					
						
							|  |  |  |         PSWAPD((%2,%1), %%mm0) | 
					
						
							|  |  |  |         PSWAPD((%3,%0), %%mm1) | 
					
						
							|  |  |  |         "pxor    %%mm7, %%mm0 \n" | 
					
						
							|  |  |  |         "movq    %%mm1, (%3,%1) \n" | 
					
						
							|  |  |  |         "movq    %%mm0, (%2,%0) \n" | 
					
						
							|  |  |  |         "sub $8, %1 \n" | 
					
						
							|  |  |  |         "add $8, %0 \n" | 
					
						
							|  |  |  |         "jl 1b \n" | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  |         :"+r"(j), "+r"(k) | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |         :"r"(output+n4), "r"(output+n4*3), | 
					
						
							|  |  |  |          "m"(*m1m1) | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  |     ); | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile("femms"); | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | } |