aarch64/vvc: Optimisations of put_luma_hv() functions for 10/12-bit

Apple M2:
put_luma_hv_10_4x4_c:                                   36.3 ( 1.00x)
put_luma_hv_10_8x8_c:                                   82.9 ( 1.00x)
put_luma_hv_10_8x8_neon:                                34.9 ( 2.37x)
put_luma_hv_10_16x16_c:                                239.2 ( 1.00x)
put_luma_hv_10_16x16_neon:                             119.0 ( 2.01x)
put_luma_hv_10_32x32_c:                                900.3 ( 1.00x)
put_luma_hv_10_32x32_neon:                             429.3 ( 2.10x)
put_luma_hv_10_64x64_c:                               2984.7 ( 1.00x)
put_luma_hv_10_64x64_neon:                            1736.2 ( 1.72x)
put_luma_hv_10_128x128_c:                            11194.2 ( 1.00x)
put_luma_hv_10_128x128_neon:                          6357.3 ( 1.76x)
put_luma_hv_12_4x4_c:                                   35.9 ( 1.00x)
put_luma_hv_12_8x8_c:                                   82.6 ( 1.00x)
put_luma_hv_12_8x8_neon:                                34.3 ( 2.41x)
put_luma_hv_12_16x16_c:                                240.2 ( 1.00x)
put_luma_hv_12_16x16_neon:                             115.3 ( 2.08x)
put_luma_hv_12_32x32_c:                                787.7 ( 1.00x)
put_luma_hv_12_32x32_neon:                             414.2 ( 1.90x)
put_luma_hv_12_64x64_c:                               3058.4 ( 1.00x)
put_luma_hv_12_64x64_neon:                            1592.3 ( 1.92x)
put_luma_hv_12_128x128_c:                            11350.8 ( 1.00x)
put_luma_hv_12_128x128_neon:                          6378.3 ( 1.78x)

RPi4:
put_luma_hv_10_4x4_c:                                  637.8 ( 1.00x)
put_luma_hv_10_8x8_c:                                 1044.9 ( 1.00x)
put_luma_hv_10_8x8_neon:                               483.7 ( 2.16x)
put_luma_hv_10_16x16_c:                               3098.0 ( 1.00x)
put_luma_hv_10_16x16_neon:                            1603.1 ( 1.93x)
put_luma_hv_10_32x32_c:                              10054.8 ( 1.00x)
put_luma_hv_10_32x32_neon:                            5843.6 ( 1.72x)
put_luma_hv_10_64x64_c:                              40506.2 ( 1.00x)
put_luma_hv_10_64x64_neon:                           24384.0 ( 1.66x)
put_luma_hv_10_128x128_c:                           130604.2 ( 1.00x)
put_luma_hv_10_128x128_neon:                         99746.6 ( 1.31x)
put_luma_hv_12_4x4_c:                                  638.2 ( 1.00x)
put_luma_hv_12_8x8_c:                                 1074.6 ( 1.00x)
put_luma_hv_12_8x8_neon:                               482.6 ( 2.23x)
put_luma_hv_12_16x16_c:                               3094.0 ( 1.00x)
put_luma_hv_12_16x16_neon:                            1602.5 ( 1.93x)
put_luma_hv_12_32x32_c:                              10034.4 ( 1.00x)
put_luma_hv_12_32x32_neon:                            5843.3 ( 1.72x)
put_luma_hv_12_64x64_c:                              40447.5 ( 1.00x)
put_luma_hv_12_64x64_neon:                           24377.2 ( 1.66x)
put_luma_hv_12_128x128_c:                           130610.4 ( 1.00x)
put_luma_hv_12_128x128_neon:                         99765.8 ( 1.31x)
This commit is contained in:
Georgii Zagoruiko 2026-03-03 18:59:32 +00:00 committed by Martin Storsjö
parent cef2fbfd4b
commit 90431417cb
2 changed files with 376 additions and 0 deletions

View file

@ -60,6 +60,19 @@ void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
void ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
/* NEON luma interpolation that applies both the horizontal (hf) and the
 * vertical (vf) 8-tap filter, in 10-bit and 12-bit variants.
 * In the accompanying assembly, hv8 and hv16 emit 8 and 16 samples per output
 * row and never read width, while hv_x16 walks across width in strips of
 * 16 samples. All of them consume height in steps of four rows. */
void ff_vvc_put_luma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
#define BIT_DEPTH 8
@ -287,6 +300,12 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[0][5][1][0] =
c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_10_neon;
c->inter.put[0][2][1][1] = ff_vvc_put_luma_hv8_10_neon;
c->inter.put[0][3][1][1] = ff_vvc_put_luma_hv16_10_neon;
c->inter.put[0][4][1][1] =
c->inter.put[0][5][1][1] =
c->inter.put[0][6][1][1] = ff_vvc_put_luma_hv_x16_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
c->alf.classify = alf_classify_10_neon;
@ -303,6 +322,12 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[0][5][0][1] =
c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
c->inter.put[0][2][1][1] = ff_vvc_put_luma_hv8_12_neon;
c->inter.put[0][3][1][1] = ff_vvc_put_luma_hv16_12_neon;
c->inter.put[0][4][1][1] =
c->inter.put[0][5][1][1] =
c->inter.put[0][6][1][1] = ff_vvc_put_luma_hv_x16_12_neon;
c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_12_neon;
c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_12_neon;
c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_12_neon;

View file

@ -2224,3 +2224,354 @@ endfunc
// Exported entry point whose whole body is the expansion of
// put_luma_v_x16_xx_neon with argument 4.
// NOTE(review): that macro is defined outside this excerpt; the 4 is
// presumably the shift used for 12-bit input - confirm against the macro.
function ff_vvc_put_luma_v_x16_12_neon, export=1
put_luma_v_x16_xx_neon 4
endfunc
// 8-tap horizontal FIR over eight adjacent 16-bit lanes.
//   shift      - immediate right shift applied when narrowing back to 16 bits
//   dst        - receives the eight filtered halfwords
//   src0, src1 - sixteen consecutive input halfwords (src0 holds the first eight)
// Taps are read from v0.h[0] .. v0.h[7].
// Scratch: v2-v7 are overwritten.
// dst may name the same register as src0 (the callers in this file do so):
// every read of src0 and src1 happens before dst is written.
.macro put_luma_hv_x8_horizontal_filter shift, dst, src0, src1
// v2..v5 = input window advanced by 1..4 halfwords (ext offsets are in bytes)
ext v2.16b, \src0\().16b, \src1\().16b, #2
ext v3.16b, \src0\().16b, \src1\().16b, #4
ext v4.16b, \src0\().16b, \src1\().16b, #6
ext v5.16b, \src0\().16b, \src1\().16b, #8
// 32-bit accumulators: v6 = output lanes 0-3, v7 = output lanes 4-7.
// The two accumulation chains are interleaved instruction by instruction.
smull v6.4s, \src0\().4h, v0.h[0]
smull2 v7.4s, \src0\().8h, v0.h[0]
smlal v6.4s, v2.4h, v0.h[1]
smlal2 v7.4s, v2.8h, v0.h[1]
smlal v6.4s, v3.4h, v0.h[2]
smlal2 v7.4s, v3.8h, v0.h[2]
smlal v6.4s, v4.4h, v0.h[3]
smlal2 v7.4s, v4.8h, v0.h[3]
smlal v6.4s, v5.4h, v0.h[4]
smlal2 v7.4s, v5.8h, v0.h[4]
// v2..v4 are reused for the windows advanced by 5..7 halfwords
ext v2.16b, \src0\().16b, \src1\().16b, #10
ext v3.16b, \src0\().16b, \src1\().16b, #12
ext v4.16b, \src0\().16b, \src1\().16b, #14
smlal v6.4s, v2.4h, v0.h[5]
smlal2 v7.4s, v2.8h, v0.h[5]
smlal v6.4s, v3.4h, v0.h[6]
smlal2 v7.4s, v3.8h, v0.h[6]
smlal v6.4s, v4.4h, v0.h[7]
smlal2 v7.4s, v4.8h, v0.h[7]
// Signed saturating narrowing shift: low half of dst from v6, high half from v7
sqshrn \dst\().4h, v6.4s, #(\shift)
sqshrn2 \dst\().8h, v7.4s, #(\shift)
.endm
// 8-tap vertical FIR across eight row registers of eight halfwords each.
//   dst0       - 32-bit accumulator for lanes 0-3; on exit its low 64 bits
//                hold the four narrowed 16-bit results
//   dst1       - same for lanes 4-7
//   src0..src7 - the eight rows; src0 is multiplied by v1.h[0], src7 by v1.h[7]
// Taps are read from v1.h[0] .. v1.h[7].
// dst0 and dst1 are written by the first multiplies, before the remaining
// sources are read, so they must not alias any of the src registers.
// Both results are narrowed with sqshrn (not sqshrn2), so the eight output
// samples end up split across the low halves of two registers.
.macro put_luma_hv_x8_vertical_filter dst0, dst1, src0, src1, src2, src3, src4, src5, src6, src7
smull \dst0\().4s, \src0\().4h, v1.h[0]
smull2 \dst1\().4s, \src0\().8h, v1.h[0]
smlal \dst0\().4s, \src1\().4h, v1.h[1]
smlal2 \dst1\().4s, \src1\().8h, v1.h[1]
smlal \dst0\().4s, \src2\().4h, v1.h[2]
smlal2 \dst1\().4s, \src2\().8h, v1.h[2]
smlal \dst0\().4s, \src3\().4h, v1.h[3]
smlal2 \dst1\().4s, \src3\().8h, v1.h[3]
smlal \dst0\().4s, \src4\().4h, v1.h[4]
smlal2 \dst1\().4s, \src4\().8h, v1.h[4]
smlal \dst0\().4s, \src5\().4h, v1.h[5]
smlal2 \dst1\().4s, \src5\().8h, v1.h[5]
smlal \dst0\().4s, \src6\().4h, v1.h[6]
smlal2 \dst1\().4s, \src6\().8h, v1.h[6]
smlal \dst0\().4s, \src7\().4h, v1.h[7]
smlal2 \dst1\().4s, \src7\().8h, v1.h[7]
// Fixed 6-bit signed saturating narrowing shift for the second filter stage
sqshrn \dst0\().4h, \dst0\().4s, #6
sqshrn \dst1\().4h, \dst1\().4s, #6
.endm
// Function body for the 8-samples-per-row horizontal+vertical luma filter.
//   shift - right shift passed on to the horizontal filter stage
// Each source row is filtered horizontally once and kept in a register; the
// vertical filter then runs over a sliding window of eight such rows.
// Four output rows are produced per loop iteration, each stored as eight
// 16-bit values. width (x6) is not read by this macro.
// Uses only v0-v7 and v16-v31, so nothing is saved on the stack.
// NOTE(review): height is decremented by 4 per iteration and the loop runs
// while the remainder is > 0, so a height that is not a multiple of 4 would
// be rounded up - presumably callers never pass one; confirm.
.macro put_luma_hv8_xx_neon shift
// dst .req x0
// _src .req x1
// _src_stride .req x2
// height .req x3
// hf .req x4
// vf .req x5
// width .req x6
// x9 = byte step applied to dst after every stored row
mov x9, #(VVC_MAX_PB_SIZE * 2)
// Move src back by 6 bytes (three 16-bit samples) and, below, by three rows
sub x1, x1, #6
// v0 = horizontal taps, sign-extended from 8 bytes to 8 halfwords
ld1 {v0.8b}, [x4]
sub x1, x1, x2, lsl #1
sxtl v0.8h, v0.8b
// v1 = vertical taps, sign-extended the same way
ld1 {v1.8b}, [x5]
sub x1, x1, x2
sxtl v1.8h, v1.8b
// Prime the window: load the first seven rows, 16 halfwords per row
ld1 {v16.8h, v17.8h}, [x1], x2
ld1 {v18.8h, v19.8h}, [x1], x2
ld1 {v20.8h, v21.8h}, [x1], x2
ld1 {v22.8h, v23.8h}, [x1], x2
ld1 {v24.8h, v25.8h}, [x1], x2
ld1 {v26.8h, v27.8h}, [x1], x2
ld1 {v28.8h, v29.8h}, [x1], x2
// Horizontal pass, in place: each even register now holds one filtered row
put_luma_hv_x8_horizontal_filter \shift, v16, v16, v17
put_luma_hv_x8_horizontal_filter \shift, v18, v18, v19
put_luma_hv_x8_horizontal_filter \shift, v20, v20, v21
put_luma_hv_x8_horizontal_filter \shift, v22, v22, v23
put_luma_hv_x8_horizontal_filter \shift, v24, v24, v25
put_luma_hv_x8_horizontal_filter \shift, v26, v26, v27
put_luma_hv_x8_horizontal_filter \shift, v28, v28, v29
1:
// Output row 0: the eighth row of the window goes into v30
ld1 {v30.8h, v31.8h}, [x1], x2
put_luma_hv_x8_horizontal_filter \shift, v30, v30, v31
put_luma_hv_x8_vertical_filter v2, v3, v16, v18, v20, v22, v24, v26, v28, v30
// Output row 1: the oldest row (v16) is no longer needed and is replaced.
// The next source row is loaded before the previous result is stored.
ld1 {v16.8h, v17.8h}, [x1], x2
st1 {v2.4h-v3.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v16, v16, v17
put_luma_hv_x8_vertical_filter v2, v3, v18, v20, v22, v24, v26, v28, v30, v16
// Output row 2: v18 is replaced
ld1 {v18.8h, v19.8h}, [x1], x2
st1 {v2.4h-v3.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v18, v18, v19
put_luma_hv_x8_vertical_filter v2, v3, v20, v22, v24, v26, v28, v30, v16, v18
// Output row 3: v20 is replaced
ld1 {v20.8h, v21.8h}, [x1], x2
st1 {v2.4h-v3.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v20, v20, v21
put_luma_hv_x8_vertical_filter v2, v3, v22, v24, v26, v28, v30, v16, v18, v20
st1 {v2.4h-v3.4h}, [x0], x9
// The seven most recent filtered rows are now, oldest first, in
// v24, v26, v28, v30, v16, v18, v20. Swap them back into the
// v16, v18, ..., v28 order expected at the loop head. v17, v19 and v21
// hold no live data at this point and serve as swap temporaries.
mov v17.16b, v16.16b
mov v16.16b, v24.16b
mov v24.16b, v17.16b
mov v19.16b, v18.16b
mov v18.16b, v26.16b
mov v26.16b, v19.16b
mov v21.16b, v20.16b
mov v20.16b, v28.16b
mov v28.16b, v21.16b
// Four rows done; the vector move below does not change the flags set here
subs w3, w3, #4
mov v22.16b, v30.16b
b.gt 1b
ret
.endm
// Exported 10-bit entry point: expands put_luma_hv8_xx_neon with shift 2.
function ff_vvc_put_luma_hv8_10_neon, export=1
put_luma_hv8_xx_neon 2
endfunc
// Exported 12-bit entry point: expands put_luma_hv8_xx_neon with shift 4.
function ff_vvc_put_luma_hv8_12_neon, export=1
put_luma_hv8_xx_neon 4
endfunc
// Function body for the 16-samples-per-row horizontal+vertical luma filter.
//   shift - right shift passed on to the horizontal filter stage
// Every source row is loaded as 24 halfwords into three registers. After the
// horizontal pass the first two registers hold the 16 filtered samples and
// the third one is free. The vertical filter runs separately on the left and
// the right eight columns. Four output rows are produced per loop iteration.
// width (x6) is not read by this macro.
// NOTE(review): height is consumed in steps of 4 with a signed > 0 test, so
// it is presumably always a positive multiple of 4 - confirm with callers.
.macro put_luma_hv16_xx_neon shift
// dst .req x0
// _src .req x1
// _src_stride .req x2
// height .req x3
// hf .req x4
// vf .req x5
// width .req x6
// v8-v15 are used below; their low 64 bits (d8-d15) are callee-saved
// under AAPCS64, so they are spilled to a 64-byte stack frame.
stp d8, d9, [sp, #-64]!
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
// x9 = byte step applied to dst after every stored row
mov x9, #(VVC_MAX_PB_SIZE * 2)
// Move src back by 6 bytes (three 16-bit samples) and, below, by three rows
sub x1, x1, #6
// v0 = horizontal taps, v1 = vertical taps, both sign-extended to halfwords
ld1 {v0.8b}, [x4]
sub x1, x1, x2, lsl #1
sxtl v0.8h, v0.8b
ld1 {v1.8b}, [x5]
sub x1, x1, x2
sxtl v1.8h, v1.8b
// Prime the window with the first seven rows
ld1 {v8.8h, v9.8h, v10.8h}, [x1], x2
ld1 {v11.8h, v12.8h, v13.8h}, [x1], x2
ld1 {v14.8h, v15.8h, v16.8h}, [x1], x2
ld1 {v17.8h, v18.8h, v19.8h}, [x1], x2
ld1 {v20.8h, v21.8h, v22.8h}, [x1], x2
ld1 {v23.8h, v24.8h, v25.8h}, [x1], x2
ld1 {v26.8h, v27.8h, v28.8h}, [x1], x2
// Horizontal pass, in place. The left half must be filtered first: it
// reads the middle register, which the right-half call then overwrites.
put_luma_hv_x8_horizontal_filter \shift, v8, v8, v9
put_luma_hv_x8_horizontal_filter \shift, v9, v9, v10
put_luma_hv_x8_horizontal_filter \shift, v11, v11, v12
put_luma_hv_x8_horizontal_filter \shift, v12, v12, v13
put_luma_hv_x8_horizontal_filter \shift, v14, v14, v15
put_luma_hv_x8_horizontal_filter \shift, v15, v15, v16
put_luma_hv_x8_horizontal_filter \shift, v17, v17, v18
put_luma_hv_x8_horizontal_filter \shift, v18, v18, v19
put_luma_hv_x8_horizontal_filter \shift, v20, v20, v21
put_luma_hv_x8_horizontal_filter \shift, v21, v21, v22
put_luma_hv_x8_horizontal_filter \shift, v23, v23, v24
put_luma_hv_x8_horizontal_filter \shift, v24, v24, v25
put_luma_hv_x8_horizontal_filter \shift, v26, v26, v27
put_luma_hv_x8_horizontal_filter \shift, v27, v27, v28
1:
// Output row 0: the eighth row of the window goes into v29/v30
ld1 {v29.8h, v30.8h, v31.8h}, [x1], x2
put_luma_hv_x8_horizontal_filter \shift, v29, v29, v30
put_luma_hv_x8_horizontal_filter \shift, v30, v30, v31
put_luma_hv_x8_vertical_filter v2, v3, v8, v11, v14, v17, v20, v23, v26, v29
put_luma_hv_x8_vertical_filter v4, v5, v9, v12, v15, v18, v21, v24, v27, v30
// Output row 1: the oldest row (v8/v9) is replaced; v2-v5 are stored
// before the next horizontal filter call reuses them as scratch.
ld1 {v8.8h, v9.8h, v10.8h}, [x1], x2
st1 {v2.4h-v5.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v8, v8, v9
put_luma_hv_x8_horizontal_filter \shift, v9, v9, v10
put_luma_hv_x8_vertical_filter v2, v3, v11, v14, v17, v20, v23, v26, v29, v8
put_luma_hv_x8_vertical_filter v4, v5, v12, v15, v18, v21, v24, v27, v30, v9
// Output row 2: v11/v12 are replaced
ld1 {v11.8h, v12.8h, v13.8h}, [x1], x2
st1 {v2.4h-v5.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v11, v11, v12
put_luma_hv_x8_horizontal_filter \shift, v12, v12, v13
put_luma_hv_x8_vertical_filter v2, v3, v14, v17, v20, v23, v26, v29, v8, v11
put_luma_hv_x8_vertical_filter v4, v5, v15, v18, v21, v24, v27, v30, v9, v12
// Output row 3: v14/v15 are replaced
ld1 {v14.8h, v15.8h, v16.8h}, [x1], x2
st1 {v2.4h-v5.4h}, [x0], x9
put_luma_hv_x8_horizontal_filter \shift, v14, v14, v15
put_luma_hv_x8_horizontal_filter \shift, v15, v15, v16
put_luma_hv_x8_vertical_filter v2, v3, v17, v20, v23, v26, v29, v8, v11, v14
put_luma_hv_x8_vertical_filter v4, v5, v18, v21, v24, v27, v30, v9, v12, v15
st1 {v2.4h-v5.4h}, [x0], x9
// The seven most recent filtered rows are now, oldest first, in
// v20/v21, v23/v24, v26/v27, v29/v30, v8/v9, v11/v12, v14/v15.
// Swap them back into the v8/v9, v11/v12, ..., v26/v27 order expected at
// the loop head, using the free third registers v10, v13, v16 as temporaries.
mov v10.16b, v8.16b
mov v8.16b, v20.16b
mov v20.16b, v10.16b
mov v10.16b, v9.16b
mov v9.16b, v21.16b
mov v21.16b, v10.16b
mov v13.16b, v11.16b
mov v11.16b, v23.16b
mov v23.16b, v13.16b
mov v13.16b, v12.16b
mov v12.16b, v24.16b
mov v24.16b, v13.16b
mov v16.16b, v14.16b
mov v14.16b, v26.16b
mov v26.16b, v16.16b
mov v16.16b, v15.16b
mov v15.16b, v27.16b
mov v27.16b, v16.16b
// Four rows done; the vector moves below do not change the flags set here
subs w3, w3, #4
mov v17.16b, v29.16b
mov v18.16b, v30.16b
b.gt 1b
// Restore d8-d15 and release the stack frame
ldp d14, d15, [sp, #48]
ldp d12, d13, [sp, #32]
ldp d10, d11, [sp, #16]
ldp d8, d9, [sp], #64
ret
.endm
// Exported 10-bit entry point: expands put_luma_hv16_xx_neon with shift 2.
function ff_vvc_put_luma_hv16_10_neon, export=1
put_luma_hv16_xx_neon 2
endfunc
// Exported 12-bit entry point: expands put_luma_hv16_xx_neon with shift 4.
function ff_vvc_put_luma_hv16_12_neon, export=1
put_luma_hv16_xx_neon 4
endfunc
// Function body for the horizontal+vertical luma filter on blocks that are
// processed as a sequence of 16-sample-wide strips.
//   shift - right shift passed on to the horizontal filter stage
// Outer loop (label 1): one strip per pass; dst and src advance by 32 bytes
// (16 halfwords) and width (w6) drops by 16 until it is no longer > 0.
// Inner loop (label 2): same scheme as the fixed 16-wide variant, run on
// per-strip copies of the pointers (x10, x11) and of height (w13), four
// output rows per iteration.
// NOTE(review): width and height are consumed in steps of 16 and 4 with
// signed > 0 tests, so they are presumably positive multiples of 16 and 4.
.macro put_luma_hv_x16_xx_neon shift
// dst .req x0
// _src .req x1
// _src_stride .req x2
// height .req x3
// hf .req x4
// vf .req x5
// width .req x6
// NOTE(review): only w6 is read after this point, so this zero-extension
// looks redundant - confirm before removing.
uxtw x6, w6
// v8-v15 are used below; their low 64 bits (d8-d15) are callee-saved
// under AAPCS64, so they are spilled to a 64-byte stack frame.
stp d8, d9, [sp, #-64]!
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
// x9 = byte step applied to dst after every stored row
mov x9, #(VVC_MAX_PB_SIZE * 2)
// Move src back by 6 bytes (three 16-bit samples) and, below, by three rows
sub x1, x1, #6
// v0 = horizontal taps, v1 = vertical taps, both sign-extended to halfwords
ld1 {v0.8b}, [x4]
sub x1, x1, x2, lsl #1
sxtl v0.8h, v0.8b
ld1 {v1.8b}, [x5]
sub x1, x1, x2
sxtl v1.8h, v1.8b
1:
// Per-strip working copies: w13 = rows left, x11 = src, x10 = dst
mov w13, w3
mov x11, x1
mov x10, x0
// Prime the window with the first seven rows of this strip
ld1 {v8.8h, v9.8h, v10.8h}, [x11], x2
ld1 {v11.8h, v12.8h, v13.8h}, [x11], x2
ld1 {v14.8h, v15.8h, v16.8h}, [x11], x2
ld1 {v17.8h, v18.8h, v19.8h}, [x11], x2
ld1 {v20.8h, v21.8h, v22.8h}, [x11], x2
ld1 {v23.8h, v24.8h, v25.8h}, [x11], x2
ld1 {v26.8h, v27.8h, v28.8h}, [x11], x2
// Horizontal pass, in place. The left half must be filtered first: it
// reads the middle register, which the right-half call then overwrites.
put_luma_hv_x8_horizontal_filter \shift, v8, v8, v9
put_luma_hv_x8_horizontal_filter \shift, v9, v9, v10
put_luma_hv_x8_horizontal_filter \shift, v11, v11, v12
put_luma_hv_x8_horizontal_filter \shift, v12, v12, v13
put_luma_hv_x8_horizontal_filter \shift, v14, v14, v15
put_luma_hv_x8_horizontal_filter \shift, v15, v15, v16
put_luma_hv_x8_horizontal_filter \shift, v17, v17, v18
put_luma_hv_x8_horizontal_filter \shift, v18, v18, v19
put_luma_hv_x8_horizontal_filter \shift, v20, v20, v21
put_luma_hv_x8_horizontal_filter \shift, v21, v21, v22
put_luma_hv_x8_horizontal_filter \shift, v23, v23, v24
put_luma_hv_x8_horizontal_filter \shift, v24, v24, v25
put_luma_hv_x8_horizontal_filter \shift, v26, v26, v27
put_luma_hv_x8_horizontal_filter \shift, v27, v27, v28
2:
// Output row 0: the eighth row of the window goes into v29/v30
ld1 {v29.8h, v30.8h, v31.8h}, [x11], x2
put_luma_hv_x8_horizontal_filter \shift, v29, v29, v30
put_luma_hv_x8_horizontal_filter \shift, v30, v30, v31
put_luma_hv_x8_vertical_filter v2, v3, v8, v11, v14, v17, v20, v23, v26, v29
put_luma_hv_x8_vertical_filter v4, v5, v9, v12, v15, v18, v21, v24, v27, v30
// Output row 1: the oldest row (v8/v9) is replaced; v2-v5 are stored
// before the next horizontal filter call reuses them as scratch.
ld1 {v8.8h, v9.8h, v10.8h}, [x11], x2
st1 {v2.4h-v5.4h}, [x10], x9
put_luma_hv_x8_horizontal_filter \shift, v8, v8, v9
put_luma_hv_x8_horizontal_filter \shift, v9, v9, v10
put_luma_hv_x8_vertical_filter v2, v3, v11, v14, v17, v20, v23, v26, v29, v8
put_luma_hv_x8_vertical_filter v4, v5, v12, v15, v18, v21, v24, v27, v30, v9
// Output row 2: v11/v12 are replaced
ld1 {v11.8h, v12.8h, v13.8h}, [x11], x2
st1 {v2.4h-v5.4h}, [x10], x9
put_luma_hv_x8_horizontal_filter \shift, v11, v11, v12
put_luma_hv_x8_horizontal_filter \shift, v12, v12, v13
put_luma_hv_x8_vertical_filter v2, v3, v14, v17, v20, v23, v26, v29, v8, v11
put_luma_hv_x8_vertical_filter v4, v5, v15, v18, v21, v24, v27, v30, v9, v12
// Output row 3: v14/v15 are replaced
ld1 {v14.8h, v15.8h, v16.8h}, [x11], x2
st1 {v2.4h-v5.4h}, [x10], x9
put_luma_hv_x8_horizontal_filter \shift, v14, v14, v15
put_luma_hv_x8_horizontal_filter \shift, v15, v15, v16
put_luma_hv_x8_vertical_filter v2, v3, v17, v20, v23, v26, v29, v8, v11, v14
put_luma_hv_x8_vertical_filter v4, v5, v18, v21, v24, v27, v30, v9, v12, v15
st1 {v2.4h-v5.4h}, [x10], x9
// The seven most recent filtered rows are now, oldest first, in
// v20/v21, v23/v24, v26/v27, v29/v30, v8/v9, v11/v12, v14/v15.
// Swap them back into the v8/v9, v11/v12, ..., v26/v27 order expected at
// label 2, using the free third registers v10, v13, v16 as temporaries.
mov v10.16b, v8.16b
mov v8.16b, v20.16b
mov v20.16b, v10.16b
mov v10.16b, v9.16b
mov v9.16b, v21.16b
mov v21.16b, v10.16b
mov v13.16b, v11.16b
mov v11.16b, v23.16b
mov v23.16b, v13.16b
mov v13.16b, v12.16b
mov v12.16b, v24.16b
mov v24.16b, v13.16b
mov v16.16b, v14.16b
mov v14.16b, v26.16b
mov v26.16b, v16.16b
mov v16.16b, v15.16b
mov v15.16b, v27.16b
mov v27.16b, v16.16b
// Four rows done; the vector moves below do not change the flags set here
subs w13, w13, #4
mov v17.16b, v29.16b
mov v18.16b, v30.16b
b.gt 2b
// Next strip: 16 halfwords (32 bytes) to the right in both dst and src
add x0, x0, #32
add x1, x1, #32
subs w6, w6, #16
b.gt 1b
// Restore d8-d15 and release the stack frame
ldp d14, d15, [sp, #48]
ldp d12, d13, [sp, #32]
ldp d10, d11, [sp, #16]
ldp d8, d9, [sp], #64
ret
.endm
// Exported 10-bit entry point: expands put_luma_hv_x16_xx_neon with shift 2.
function ff_vvc_put_luma_hv_x16_10_neon, export=1
put_luma_hv_x16_xx_neon 2
endfunc
// Exported 12-bit entry point: expands put_luma_hv_x16_xx_neon with shift 4.
function ff_vvc_put_luma_hv_x16_12_neon, export=1
put_luma_hv_x16_xx_neon 4
endfunc