/*
 * SIMD-optimized halfpel functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"

/*
 * Prototypes of half-pel routines that are only declared in this file; their
 * definitions live elsewhere (they are referenced below under the
 * HAVE_*_EXTERNAL guards, so presumably in external assembly).
 *
 * All of them share one signature:
 *   block     - destination pixels
 *   pixels    - source pixels
 *   line_size - distance in bytes between two consecutive rows
 *   h         - number of rows to process
 *
 * Naming used throughout this file: "put" vs. "avg" and "no_rnd" select the
 * store/rounding flavour, the number is the block width in pixels, and
 * x2 / y2 / xy2 name the interpolation direction.
 */
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);

/*
 * Short aliases so that the token-pasting SET_HPEL_FUNCS* macros further down
 * (which build names of the form <prefix>_pixels<size>_<cpu>) resolve to the
 * ff_-prefixed functions.
 *
 * The put_no_rnd_* full-pel aliases map to the very same functions as the
 * put_* ones: both pairs expand to ff_put_pixels8_mmx / ff_put_pixels16_mmx.
 */
#define put_pixels8_mmx         ff_put_pixels8_mmx
#define put_pixels16_mmx        ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx

#if HAVE_INLINE_ASM

/***********************************/
/* MMX no rounding */
/*
 * Instantiate rnd_template.c for the "no rounding" flavour:
 *   DEF(x, y) - names the generated functions <x>_no_rnd_<y>_mmx
 *   SET_RND   - rounding-constant loader handed to the template (MOVQ_WONE)
 *   STATIC    - linkage of the generated functions (file-local here)
 * The template itself is not part of this file; presumably it provides
 * put_no_rnd_pixels8_xy2_mmx, which is used below but not defined here.
 */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define STATIC static

#include "rnd_template.c"

#undef DEF
#undef SET_RND
#undef STATIC

2025-05-30 13:54:50 +02:00
|
|
|
// this routine is 'slightly' suboptimal but mostly unused
|
|
|
|
|
static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
|
|
|
|
|
ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_ZERO(mm7);
|
|
|
|
|
MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm4 \n\t"
|
|
|
|
|
"movq %%mm0, %%mm1 \n\t"
|
|
|
|
|
"movq %%mm4, %%mm5 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
|
|
|
|
"paddusw %%mm0, %%mm4 \n\t"
|
|
|
|
|
"paddusw %%mm1, %%mm5 \n\t"
|
|
|
|
|
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
|
|
|
|
|
"add %3, %1 \n\t"
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
|
|
|
|
|
"movq %%mm0, %%mm1 \n\t"
|
|
|
|
|
"movq %%mm2, %%mm3 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm1 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
|
|
|
|
"paddusw %%mm2, %%mm0 \n\t"
|
|
|
|
|
"paddusw %%mm3, %%mm1 \n\t"
|
|
|
|
|
"paddusw %%mm6, %%mm4 \n\t"
|
|
|
|
|
"paddusw %%mm6, %%mm5 \n\t"
|
|
|
|
|
"paddusw %%mm0, %%mm4 \n\t"
|
|
|
|
|
"paddusw %%mm1, %%mm5 \n\t"
|
|
|
|
|
"psrlw $2, %%mm4 \n\t"
|
|
|
|
|
"psrlw $2, %%mm5 \n\t"
|
|
|
|
|
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
|
|
|
|
|
"packuswb %%mm5, %%mm4 \n\t"
|
|
|
|
|
"pcmpeqd %%mm2, %%mm2 \n\t"
|
|
|
|
|
"paddb %%mm2, %%mm2 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
|
|
|
|
|
"movq %%mm5, (%2, %%"FF_REG_a") \n\t"
|
|
|
|
|
"add %3, %%"FF_REG_a" \n\t"
|
|
|
|
|
|
|
|
|
|
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
|
|
|
|
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
|
|
|
|
|
"movq %%mm2, %%mm3 \n\t"
|
|
|
|
|
"movq %%mm4, %%mm5 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t"
|
|
|
|
|
"punpcklbw %%mm7, %%mm4 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm3 \n\t"
|
|
|
|
|
"punpckhbw %%mm7, %%mm5 \n\t"
|
|
|
|
|
"paddusw %%mm2, %%mm4 \n\t"
|
|
|
|
|
"paddusw %%mm3, %%mm5 \n\t"
|
|
|
|
|
"paddusw %%mm6, %%mm0 \n\t"
|
|
|
|
|
"paddusw %%mm6, %%mm1 \n\t"
|
|
|
|
|
"paddusw %%mm4, %%mm0 \n\t"
|
|
|
|
|
"paddusw %%mm5, %%mm1 \n\t"
|
|
|
|
|
"psrlw $2, %%mm0 \n\t"
|
|
|
|
|
"psrlw $2, %%mm1 \n\t"
|
|
|
|
|
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
|
|
|
|
|
"packuswb %%mm1, %%mm0 \n\t"
|
|
|
|
|
"pcmpeqd %%mm2, %%mm2 \n\t"
|
|
|
|
|
"paddb %%mm2, %%mm2 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
|
|
|
|
|
"movq %%mm1, (%2, %%"FF_REG_a") \n\t"
|
|
|
|
|
"add %3, %%"FF_REG_a" \n\t"
|
|
|
|
|
|
|
|
|
|
"subl $2, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels)
|
|
|
|
|
:"D"(block), "r"((x86_reg)line_size)
|
|
|
|
|
:FF_REG_a, "memory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_BFE(mm6);
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 1(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 1(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"subl $4, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
|
|
|
:"r"((x86_reg)line_size)
|
|
|
|
|
:FF_REG_a, "memory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_BFE(mm6);
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 1(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"movq 8(%1), %%mm0 \n\t"
|
|
|
|
|
"movq 9(%1), %%mm1 \n\t"
|
|
|
|
|
"movq 8(%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 9(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, 8(%2) \n\t"
|
|
|
|
|
"movq %%mm5, 8(%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 1(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"movq 8(%1), %%mm0 \n\t"
|
|
|
|
|
"movq 9(%1), %%mm1 \n\t"
|
|
|
|
|
"movq 8(%1, %3), %%mm2 \n\t"
|
|
|
|
|
"movq 9(%1, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
|
|
|
"movq %%mm4, 8(%2) \n\t"
|
|
|
|
|
"movq %%mm5, 8(%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"subl $4, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
|
|
|
:"r"((x86_reg)line_size)
|
|
|
|
|
:FF_REG_a, "memory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_BFE(mm6);
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %%"FF_REG_a"),%%mm2\n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %%"FF_REG_a"),%%mm0\n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
|
|
|
"movq %%mm4, (%2) \n\t"
|
|
|
|
|
"movq %%mm5, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
"subl $4, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
|
|
|
:"r"((x86_reg)line_size)
|
|
|
|
|
:FF_REG_a, "memory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_BFE(mm6);
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
"movq 1(%1), %%mm1 \n\t"
|
|
|
|
|
"movq (%2), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
|
|
|
"movq %%mm0, (%2) \n\t"
|
|
|
|
|
"movq 8(%1), %%mm0 \n\t"
|
|
|
|
|
"movq 9(%1), %%mm1 \n\t"
|
|
|
|
|
"movq 8(%2), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
|
|
|
"movq %%mm0, 8(%2) \n\t"
|
|
|
|
|
"add %3, %1 \n\t"
|
|
|
|
|
"add %3, %2 \n\t"
|
|
|
|
|
"subl $1, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
|
|
|
:"r"((x86_reg)line_size)
|
|
|
|
|
:"memory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
|
|
|
|
{
|
|
|
|
|
MOVQ_BFE(mm6);
|
|
|
|
|
__asm__ volatile(
|
|
|
|
|
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
|
|
|
|
"movq (%1), %%mm0 \n\t"
|
|
|
|
|
".p2align 3 \n\t"
|
|
|
|
|
"1: \n\t"
|
|
|
|
|
"movq (%1, %3), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
|
|
|
"movq (%2), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
|
|
|
|
|
"movq (%2, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
|
|
|
"movq %%mm0, (%2) \n\t"
|
|
|
|
|
"movq %%mm1, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
|
|
|
|
|
"movq (%1, %3), %%mm1 \n\t"
|
|
|
|
|
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
|
|
|
|
|
PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
|
|
|
"movq (%2), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
|
|
|
|
|
"movq (%2, %3), %%mm3 \n\t"
|
|
|
|
|
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
|
|
|
"movq %%mm2, (%2) \n\t"
|
|
|
|
|
"movq %%mm1, (%2, %3) \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %1 \n\t"
|
|
|
|
|
"add %%"FF_REG_a", %2 \n\t"
|
|
|
|
|
|
|
|
|
|
"subl $4, %0 \n\t"
|
|
|
|
|
"jnz 1b \n\t"
|
|
|
|
|
:"+g"(h), "+S"(pixels), "+D"(block)
|
|
|
|
|
:"r"((x86_reg)line_size)
|
|
|
|
|
:FF_REG_a, "memory");
|
|
|
|
|
}
|
|
|
|
|
|
/*
 * 16-wide "no rounding" MMX functions generated from the 8-wide ones above.
 * CALL_2X_PIXELS is not defined in this file (NOTE(review): presumably it
 * comes from libavcodec/pixels.h and emits a function named by its first
 * argument that runs the second one on two halves, the last argument being
 * the byte offset of the second half - confirm there).
 */
#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)

CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif

/***********************************/
/* MMX rounding */
/*
 * Second instantiation of rnd_template.c, this time for the rounding flavour:
 *   SET_RND   - rounding-constant loader handed to the template (MOVQ_WTWO)
 *   DEF(x, y) - names the generated functions ff_<x>_<y>_mmx
 *   STATIC    - empty, so the generated functions get external linkage
 */

#define SET_RND MOVQ_WTWO
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC

#include "rnd_template.c"

/* NOTE(review): NO_AVG is never defined in this file, so this #undef only
 * matters if the template defines it - confirm whether it is still needed.
 * STATIC is intentionally left as in the original (it is not undefined). */
#undef NO_AVG
#undef DEF
#undef SET_RND

#if HAVE_MMX
/* 16-wide rounding xy2 built from the 8-wide template instance. */
CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif

#endif /* HAVE_INLINE_ASM */

#if HAVE_X86ASM

/*
 * Emit the 16-wide wrappers <name>16_...<CPUEXT> around the 8-wide external
 * functions ff_<name>8_...<CPUEXT>, one CALL_2X_PIXELS expansion per entry.
 * Despite the macro's name, the list covers put, put_no_rnd and avg variants.
 * CPUEXT is pasted verbatim onto the names, so it must carry its leading
 * underscore (e.g. _mmxext).
 */
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)

HPELDSP_AVG_PIXELS16(_mmxext)

#endif /* HAVE_X86ASM */

/*
 * Helpers that fill the function tables of the HpelDSPContext pointer named
 * `c` in the scope where they are expanded.
 *
 *   PFX  - table/function prefix (put, put_no_rnd, avg_no_rnd, ...)
 *   IDX  - first-level subscript written with its brackets ([0], [1]), or
 *          left empty for a table that has only one dimension
 *   SIZE - block width that is part of the function name (16, 8)
 *   CPU  - instruction-set suffix of the function name (mmx)
 *
 * Slot meaning, as used by the assignments in this file:
 *   [0] full-pel, [1] x2, [2] y2, [3] xy2.
 */

/*
 * Slot 0 is only installed when HAVE_MMX_EXTERNAL is nonzero.
 * The statement is wrapped in do { } while (0) so that the macro behaves as
 * one statement: as a bare `if` it would capture an `else` that follows its
 * use inside an unbraced if/else.
 */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                        \
        if (HAVE_MMX_EXTERNAL)                                                  \
            c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    } while (0)

/* Slots 0 (conditionally, see above) and 3. */
#define SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU)                                       \
    do {                                                                            \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                    \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;     \
    } while (0)

/* Slots 1 and 2. */
#define SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU)                                       \
    do {                                                                            \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;      \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;      \
    } while (0)

/* All four slots. */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                         \
    do {                                                                            \
        SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU);                                      \
        SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU);                                      \
    } while (0)

2016-01-11 16:06:33 +01:00
|
|
|
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
|
2013-03-10 15:37:59 -07:00
|
|
|
{
|
2024-02-16 20:36:24 +01:00
|
|
|
#if HAVE_MMX_INLINE
|
2022-06-10 22:42:01 +02:00
|
|
|
SET_HPEL_FUNCS03(put, [0], 16, mmx);
|
2013-03-10 15:37:59 -07:00
|
|
|
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
|
2024-02-16 20:36:24 +01:00
|
|
|
SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx);
|
|
|
|
|
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
|
2022-06-10 22:42:01 +02:00
|
|
|
SET_HPEL_FUNCS03(put, [1], 8, mmx);
|
2013-03-10 15:37:59 -07:00
|
|
|
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
|
2024-02-16 20:36:24 +01:00
|
|
|
#endif
|
2013-03-10 15:37:59 -07:00
|
|
|
}
|
|
|
|
|
|
2016-01-11 16:06:33 +01:00
|
|
|
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
|
2013-03-10 15:37:59 -07:00
|
|
|
{
|
2013-05-06 00:58:07 +02:00
|
|
|
#if HAVE_MMXEXT_EXTERNAL
|
2013-03-10 15:37:59 -07:00
|
|
|
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
|
2013-04-20 22:15:22 +02:00
|
|
|
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
|
2013-03-10 15:37:59 -07:00
|
|
|
|
2025-05-30 12:55:54 +02:00
|
|
|
c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
|
2013-04-20 22:15:22 +02:00
|
|
|
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
|
|
|
|
|
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
|
2014-05-22 17:48:18 +00:00
|
|
|
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
|
2013-03-10 15:37:59 -07:00
|
|
|
|
|
|
|
|
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
|
|
|
|
|
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
|
|
|
|
|
|
|
|
|
|
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
|
|
|
|
|
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
|
|
|
|
|
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
|
2014-05-22 17:48:18 +00:00
|
|
|
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
|
2013-03-10 15:37:59 -07:00
|
|
|
|
2023-09-04 11:52:06 +02:00
|
|
|
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
|
|
|
|
|
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
|
|
|
|
|
|
2024-02-16 20:36:24 +01:00
|
|
|
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_mmxext;
|
|
|
|
|
|
2015-06-29 21:59:37 +02:00
|
|
|
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
|
2013-04-20 22:15:22 +02:00
|
|
|
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
|
|
|
|
|
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
|
2013-03-10 15:37:59 -07:00
|
|
|
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
|
|
|
|
|
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
|
|
|
|
|
|
2014-05-22 17:48:17 +00:00
|
|
|
c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
|
|
|
|
|
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
|
2013-03-10 15:37:59 -07:00
|
|
|
}
|
2013-05-06 00:58:07 +02:00
|
|
|
#endif /* HAVE_MMXEXT_EXTERNAL */
|
2013-03-10 15:37:59 -07:00
|
|
|
}
|
|
|
|
|
|
2016-01-11 16:06:33 +01:00
|
|
|
static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
|
2013-03-10 15:37:59 -07:00
|
|
|
{
|
2013-05-06 00:58:07 +02:00
|
|
|
#if HAVE_SSE2_EXTERNAL
|
2016-01-11 16:04:17 +01:00
|
|
|
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
|
|
|
|
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
2017-01-31 14:53:27 -03:00
|
|
|
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
|
|
|
|
|
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
|
|
|
|
|
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
|
2016-01-11 16:04:17 +01:00
|
|
|
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
|
2017-01-31 14:53:27 -03:00
|
|
|
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
|
|
|
|
|
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
|
|
|
|
|
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
|
2024-02-16 20:36:24 +01:00
|
|
|
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2;
|
2013-05-06 00:58:07 +02:00
|
|
|
#endif /* HAVE_SSE2_EXTERNAL */
|
2013-03-10 15:37:59 -07:00
|
|
|
}
|
|
|
|
|
|
2017-01-31 14:56:11 -03:00
|
|
|
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
|
2014-05-22 23:47:06 +02:00
|
|
|
{
|
|
|
|
|
#if HAVE_SSSE3_EXTERNAL
|
|
|
|
|
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
|
|
|
|
|
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
|
|
|
|
|
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
|
|
|
|
|
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2014-01-09 14:06:32 +01:00
|
|
|
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
|
2013-03-10 15:37:59 -07:00
|
|
|
{
|
2013-07-17 20:19:24 +02:00
|
|
|
int cpu_flags = av_get_cpu_flags();
|
2013-03-10 15:37:59 -07:00
|
|
|
|
2013-08-20 14:46:58 +02:00
|
|
|
if (INLINE_MMX(cpu_flags))
|
2016-01-11 16:06:33 +01:00
|
|
|
hpeldsp_init_mmx(c, flags);
|
2013-03-10 15:37:59 -07:00
|
|
|
|
2013-09-30 15:04:44 +02:00
|
|
|
if (EXTERNAL_MMXEXT(cpu_flags))
|
2016-01-11 16:06:33 +01:00
|
|
|
hpeldsp_init_mmxext(c, flags);
|
2013-09-30 15:04:44 +02:00
|
|
|
|
2016-01-11 16:04:17 +01:00
|
|
|
if (EXTERNAL_SSE2_FAST(cpu_flags))
|
2016-01-11 16:06:33 +01:00
|
|
|
hpeldsp_init_sse2_fast(c, flags);
|
2014-05-22 23:47:06 +02:00
|
|
|
|
|
|
|
|
if (EXTERNAL_SSSE3(cpu_flags))
|
2017-01-31 14:56:11 -03:00
|
|
|
hpeldsp_init_ssse3(c, flags);
|
2013-03-10 15:37:59 -07:00
|
|
|
}
|