ffmpeg/libavutil/x86/pixelutils.h
Andreas Rheinhardt e1782fb016 avutil/x86/pixelutils: Don't use mmx in 8x8 SAD
This function is exported, so has to abide by the ABI
and therefore issues emms since commit
5b85ca5317. Yet this is
expensive and using SSE2 instead improves performance.
Also avoid the initial zeroing and the last pointer
increment while just at it.
This removes the last usage of mmx from libavutil*.

Old benchmarks:
sad_8x8_0_c:                                            13.2 ( 1.00x)
sad_8x8_0_mmxext:                                       27.8 ( 0.48x)
sad_8x8_1_c:                                            13.2 ( 1.00x)
sad_8x8_1_mmxext:                                       27.6 ( 0.48x)
sad_8x8_2_c:                                            13.3 ( 1.00x)
sad_8x8_2_mmxext:                                       27.6 ( 0.48x)

New benchmarks:
sad_8x8_0_c:                                            13.3 ( 1.00x)
sad_8x8_0_sse2:                                         11.7 ( 1.13x)
sad_8x8_1_c:                                            13.8 ( 1.00x)
sad_8x8_1_sse2:                                         11.6 ( 1.20x)
sad_8x8_2_c:                                            13.2 ( 1.00x)
sad_8x8_2_sse2:                                         11.8 ( 1.12x)

Hint: Using two psadbw or one psadbw and movhps made no difference
in the benchmarks, so I chose the latter due to smaller codesize.

*: except if lavu provides avpriv_emms for other libraries

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2026-04-18 21:21:11 +02:00

78 lines
3.2 KiB
C

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_PIXELUTILS_H
#define AVUTIL_X86_PIXELUTILS_H
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/pixelutils.h"
int ff_pixelutils_sad_8x8_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
static inline av_cold void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
sad[2] = ff_pixelutils_sad_8x8_sse2;
switch (aligned) {
case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
switch (aligned) {
case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned
}
}
#if HAVE_AVX2_EXTERNAL
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
sad[4] = ff_pixelutils_sad_32x32_avx2;
}
#endif
}
#endif