avcodec/vp9: add 64-bit ipred_dr_32x32_16 avx2 implementation

vp9_diag_downright_32x32_12bpp_c: 429.7
vp9_diag_downright_32x32_12bpp_sse2: 158.9
vp9_diag_downright_32x32_12bpp_ssse3: 144.6
vp9_diag_downright_32x32_12bpp_avx: 141.0
vp9_diag_downright_32x32_12bpp_avx2: 73.8

Almost twice as fast as the avx implementation (141.0 -> 73.8, i.e. about 48% lower)

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Ilia Valiakhmetov 2017-06-27 15:06:19 -04:00 committed by Ronald S. Bultje
parent 0daa1cf073
commit 35a5d9715d
2 changed files with 106 additions and 3 deletions

View file

@ -52,8 +52,9 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
decl_ipred_fn(dl, 16, 16, avx2);
decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dl, 32, 16, avx2);
decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dr, 32, 16, avx2);
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@ -137,8 +138,9 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
}
#endif /* HAVE_X86ASM */