From 1e86a92a1cd752f64b47a8a09a44626320cd4e27 Mon Sep 17 00:00:00 2001
From: Zhao Zhili
Date: Thu, 21 May 2026 15:01:50 +0800
Subject: [PATCH] lavu/aarch64: unroll butterflies_float to 8 floats/iter

butterflies_float_neon:     before          after
Cortex-A76 (gcc 12.4):      163.1 (3.95x)   147.0 (4.37x)
Apple M1 (clang 16):        0.7 (0.85x)     0.6 (0.99x)

Signed-off-by: Zhao Zhili
---
 libavutil/aarch64/float_dsp_neon.S | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index fee47cb474..aa6b5189f5 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -178,15 +178,26 @@ function ff_vector_fmul_reverse_neon, export=1
 endfunc
 
 function ff_butterflies_float_neon, export=1
-1:      ld1             {v0.4s}, [x0]
+        subs            w2, w2, #8                      // bias the element counter (presumably the length argument) by -8
+        b.lt            2f                              // fewer than 8 elements: skip the unrolled loop
+1:      ldp             q0, q1, [x0]                    // 8 floats from the buffer at x0
+        ldp             q2, q3, [x1]                    // 8 floats from the buffer at x1
+        subs            w2, w2, #8                      // flags are consumed by b.ge below; the NEON ops in between do not write NZCV
+        fadd            v4.4s, v0.4s, v2.4s             // sums go to v4/v5
+        fadd            v5.4s, v1.4s, v3.4s
+        fsub            v0.4s, v0.4s, v2.4s             // differences overwrite v0/v1
+        fsub            v1.4s, v1.4s, v3.4s
+        st1             {v4.4s, v5.4s}, [x0], #32       // sums back to the x0 buffer, x0 += 32 bytes
+        st1             {v0.4s, v1.4s}, [x1], #32       // differences back to the x1 buffer, x1 += 32 bytes
+        b.ge            1b                              // another 8 elements are still available
+2:      tbz             w2, #2, 3f                      // w2 = remaining - 8 here: -4 (bit 2 set) means one 4-float tail, -8 means done; assumes the count is a multiple of 4 -- TODO confirm against callers
+        ld1             {v0.4s}, [x0]                   // tail: 4 floats from the buffer at x0
         ld1             {v1.4s}, [x1]
-        subs            w2, w2, #4
         fsub            v2.4s, v0.4s, v1.4s
         fadd            v3.4s, v0.4s, v1.4s
-        st1             {v2.4s}, [x1], #16
-        st1             {v3.4s}, [x0], #16
-        b.gt            1b
-        ret
+        st1             {v2.4s}, [x1]                   // no post-increment: the pointers are not used again
+        st1             {v3.4s}, [x0]
+3:      ret
 endfunc
 
 function ff_scalarproduct_float_neon, export=1