/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
					
						
							| 
									
										
										
										
											2008-05-09 11:56:36 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include "libavutil/x86_cpu.h"
 | 
					
						
							|  |  |  | #include "libavcodec/dsputil.h"
 | 
					
						
							| 
									
										
										
										
											2009-09-15 21:14:14 +00:00
										 |  |  | #include "fft.h"
 | 
					
						
							| 
									
										
										
										
											2011-05-27 21:18:12 +02:00
										 |  |  | #include "config.h"
 | 
					
						
							| 
									
										
										
										
											2002-10-28 00:34:08 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-19 21:32:09 -04:00
										 |  |  | DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = | 
					
						
							|  |  |  |     { 1U << 31, 1U << 31, 1U << 31, 1U << 31 }; | 
					
						
							| 
									
										
										
										
											2006-09-21 16:37:39 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | void ff_fft_dispatch_sse(FFTComplex *z, int nbits); | 
					
						
							|  |  |  | void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); | 
					
						
							| 
									
										
										
										
											2011-04-25 11:39:01 +02:00
										 |  |  | void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-05-26 19:44:39 +02:00
										 #if HAVE_AVX
/**
 * AVX entry point for the FFT: hands the buffer straight to the AVX
 * dispatch/interleave kernel.
 *
 * @param s FFT context; only s->nbits is read here
 * @param z buffer passed unchanged to ff_fft_dispatch_interleave_avx()
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif /* HAVE_AVX */
 | 
					
						
							| 
									
										
										
										
											2002-10-28 00:34:08 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2004-03-13 21:43:24 +00:00
										 |  |  | void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) | 
					
						
							| 
									
										
										
										
											2002-10-28 00:34:08 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  |     int n = 1 << s->nbits; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ff_fft_dispatch_interleave_sse(z, s->nbits); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if(n <= 16) { | 
					
						
							|  |  |  |         x86_reg i = -8*n; | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |         __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  |             "1: \n" | 
					
						
							|  |  |  |             "movaps     (%0,%1), %%xmm0 \n" | 
					
						
							|  |  |  |             "movaps      %%xmm0, %%xmm1 \n" | 
					
						
							|  |  |  |             "unpcklps 16(%0,%1), %%xmm0 \n" | 
					
						
							|  |  |  |             "unpckhps 16(%0,%1), %%xmm1 \n" | 
					
						
							|  |  |  |             "movaps      %%xmm0,   (%0,%1) \n" | 
					
						
							|  |  |  |             "movaps      %%xmm1, 16(%0,%1) \n" | 
					
						
							|  |  |  |             "add $32, %0 \n" | 
					
						
							|  |  |  |             "jl 1b \n" | 
					
						
							|  |  |  |             :"+r"(i) | 
					
						
							|  |  |  |             :"r"(z+n) | 
					
						
							|  |  |  |             :"memory" | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2002-10-28 00:34:08 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  | void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     int n = 1 << s->nbits; | 
					
						
							|  |  |  |     int i; | 
					
						
							|  |  |  |     for(i=0; i<n; i+=2) { | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |         __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2008-08-12 00:26:58 +00:00
										 |  |  |             "movaps %2, %%xmm0 \n" | 
					
						
							|  |  |  |             "movlps %%xmm0, %0 \n" | 
					
						
							|  |  |  |             "movhps %%xmm0, %1 \n" | 
					
						
							|  |  |  |             :"=m"(s->tmp_buf[s->revtab[i]]), | 
					
						
							|  |  |  |              "=m"(s->tmp_buf[s->revtab[i+1]]) | 
					
						
							|  |  |  |             :"m"(z[i]) | 
					
						
							|  |  |  |         ); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); | 
					
						
							| 
									
										
										
										
											2002-10-28 00:34:08 +00:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2003-01-07 17:41:43 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-09-20 17:30:20 +00:00
										 |  |  | void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     x86_reg j, k; | 
					
						
							| 
									
										
										
										
											2010-08-23 15:51:09 +00:00
										 |  |  |     long n = s->mdct_size; | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |     long n4 = n >> 2; | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-04-25 11:39:01 +02:00
										 |  |  |     s->imdct_half(s, output + n4, input); | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     j = -n; | 
					
						
							|  |  |  |     k = n-16; | 
					
						
							| 
									
										
										
										
											2008-10-16 13:34:09 +00:00
										 |  |  |     __asm__ volatile( | 
					
						
							| 
									
										
										
										
											2011-01-30 01:04:41 -08:00
										 |  |  |         "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" | 
					
						
							| 
									
										
										
										
											2008-08-12 00:33:34 +00:00
										 |  |  |         "1: \n" | 
					
						
							|  |  |  |         "movaps       (%2,%1), %%xmm0 \n" | 
					
						
							|  |  |  |         "movaps       (%3,%0), %%xmm1 \n" | 
					
						
							|  |  |  |         "shufps $0x1b, %%xmm0, %%xmm0 \n" | 
					
						
							|  |  |  |         "shufps $0x1b, %%xmm1, %%xmm1 \n" | 
					
						
							|  |  |  |         "xorps         %%xmm7, %%xmm0 \n" | 
					
						
							|  |  |  |         "movaps        %%xmm1, (%3,%1) \n" | 
					
						
							|  |  |  |         "movaps        %%xmm0, (%2,%0) \n" | 
					
						
							|  |  |  |         "sub $16, %1 \n" | 
					
						
							|  |  |  |         "add $16, %0 \n" | 
					
						
							|  |  |  |         "jl 1b \n" | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  |         :"+r"(j), "+r"(k) | 
					
						
							| 
									
										
										
										
											2011-01-30 01:04:41 -08:00
										 |  |  |         :"r"(output+n4), "r"(output+n4*3) | 
					
						
							| 
									
										
										
										
											2010-10-06 01:27:02 +00:00
										 |  |  |         XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7") | 
					
						
							| 
									
										
										
										
											2008-07-13 15:03:58 +00:00
										 |  |  |     ); | 
					
						
							|  |  |  | } |