mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-02-12 04:59:59 +00:00
Before this patch, prof_grad_filter calculated
gh[0], gh[1], gv[0], gv[1] and saved them to the stack.
derive_bdof_vx_vy loaded them from the stack and calculated
gh[0] + gh[1], gv[0] + gv[1].
apply_bdof_min_block loaded them from the stack and calculated
gh[0] - gh[1], gv[0] - gv[1].
This patch adds bdof_grad_filter, which calculates gh[0] + gh[1],
gh[0] - gh[1], gv[0] + gv[1], gv[0] - gv[1], and saves them to the
stack, so derive_bdof_vx_vy and apply_bdof_min_block can use the
results directly.
prof_grad_filter is kept for reuse by other functions in the future.
Benchmark on rpi5 with gcc 12
Before After
--------------------------------------------------------------------
apply_bdof_8_8x16_c: | 7431.4 ( 1.00x) | 7371.7 ( 1.00x)
apply_bdof_8_8x16_neon: | 1175.4 ( 6.32x) | 1036.3 ( 7.11x)
apply_bdof_8_16x8_c: | 7182.2 ( 1.00x) | 7201.1 ( 1.00x)
apply_bdof_8_16x8_neon: | 1021.7 ( 7.03x) | 879.9 ( 8.18x)
apply_bdof_8_16x16_c: | 14577.1 ( 1.00x) | 14589.3 ( 1.00x)
apply_bdof_8_16x16_neon: | 2012.8 ( 7.24x) | 1743.3 ( 8.37x)
apply_bdof_10_8x16_c: | 7292.4 ( 1.00x) | 7308.5 ( 1.00x)
apply_bdof_10_8x16_neon: | 1156.3 ( 6.31x) | 1045.3 ( 6.99x)
apply_bdof_10_16x8_c: | 7112.4 ( 1.00x) | 7214.4 ( 1.00x)
apply_bdof_10_16x8_neon: | 1007.6 ( 7.06x) | 904.8 ( 7.97x)
apply_bdof_10_16x16_c: | 14363.3 ( 1.00x) | 14476.4 ( 1.00x)
apply_bdof_10_16x16_neon: | 1986.9 ( 7.23x) | 1783.1 ( 8.12x)
apply_bdof_12_8x16_c: | 7433.3 ( 1.00x) | 7374.7 ( 1.00x)
apply_bdof_12_8x16_neon: | 1155.9 ( 6.43x) | 1040.8 ( 7.09x)
apply_bdof_12_16x8_c: | 7171.1 ( 1.00x) | 7376.3 ( 1.00x)
apply_bdof_12_16x8_neon: | 1010.8 ( 7.09x) | 899.4 ( 8.20x)
apply_bdof_12_16x16_c: | 14515.5 ( 1.00x) | 14731.5 ( 1.00x)
apply_bdof_12_16x16_neon: | 1988.4 ( 7.30x) | 1785.2 ( 8.25x)
|
||
|---|---|---|
| .. | ||
| h26x | ||
| vvc | ||
| aacencdsp_init.c | ||
| aacencdsp_neon.S | ||
| aacpsdsp_init_aarch64.c | ||
| aacpsdsp_neon.S | ||
| ac3dsp_init_aarch64.c | ||
| ac3dsp_neon.S | ||
| cabac.h | ||
| fdct.h | ||
| fdctdsp_init_aarch64.c | ||
| fdctdsp_neon.S | ||
| fmtconvert_init.c | ||
| fmtconvert_neon.S | ||
| h264chroma_init_aarch64.c | ||
| h264cmc_neon.S | ||
| h264dsp_init_aarch64.c | ||
| h264dsp_neon.S | ||
| h264idct_neon.S | ||
| h264pred_init.c | ||
| h264pred_neon.S | ||
| h264qpel_init_aarch64.c | ||
| h264qpel_neon.S | ||
| hevcdsp_deblock_neon.S | ||
| hevcdsp_idct_neon.S | ||
| hevcdsp_init_aarch64.c | ||
| hpeldsp_init_aarch64.c | ||
| hpeldsp_neon.S | ||
| idct.h | ||
| idctdsp_init_aarch64.c | ||
| idctdsp_neon.S | ||
| Makefile | ||
| me_cmp_init_aarch64.c | ||
| me_cmp_neon.S | ||
| mpegaudiodsp_init.c | ||
| mpegaudiodsp_neon.S | ||
| mpegvideoencdsp_init.c | ||
| mpegvideoencdsp_neon.S | ||
| neon.S | ||
| neontest.c | ||
| opusdsp_init.c | ||
| opusdsp_neon.S | ||
| pixblockdsp_init_aarch64.c | ||
| pixblockdsp_neon.S | ||
| rv40dsp_init_aarch64.c | ||
| sbrdsp_init_aarch64.c | ||
| sbrdsp_neon.S | ||
| simple_idct_neon.S | ||
| synth_filter_init.c | ||
| synth_filter_neon.S | ||
| vc1dsp_init_aarch64.c | ||
| vc1dsp_neon.S | ||
| videodsp.S | ||
| videodsp_init.c | ||
| vorbisdsp_init.c | ||
| vorbisdsp_neon.S | ||
| vp8dsp.h | ||
| vp8dsp_init_aarch64.c | ||
| vp8dsp_neon.S | ||
| vp9dsp_init.h | ||
| vp9dsp_init_10bpp_aarch64.c | ||
| vp9dsp_init_12bpp_aarch64.c | ||
| vp9dsp_init_16bpp_aarch64_template.c | ||
| vp9dsp_init_aarch64.c | ||
| vp9itxfm_16bpp_neon.S | ||
| vp9itxfm_neon.S | ||
| vp9lpf_16bpp_neon.S | ||
| vp9lpf_neon.S | ||
| vp9mc_16bpp_neon.S | ||
| vp9mc_aarch64.S | ||
| vp9mc_neon.S | ||