From ba516a34cd1cdc46ab35a9440ef2f0891c774bd0 Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Sun, 12 Apr 2026 18:16:39 +0200
Subject: [PATCH] swscale/x86/ops_int: use sized mov for packed_shuffle output

This code made the input read conditional on the byte count, but not the
output, leading to a lot of over-write for cases like 15, 5.

Signed-off-by: Niklas Haas
---
 libswscale/x86/ops.c       | 14 +++++++++-----
 libswscale/x86/ops_int.asm | 20 ++++++++++++--------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 115970c226..a87fa56f53 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -869,6 +869,13 @@ static bool op_is_type_invariant(const SwsOp *op)
     return false;
 }
 
+static int movsize(const int bytes, const int mmsize)
+{
+    return bytes <= 4 ? 4 : /* movd */
+           bytes <= 8 ? 8 : /* movq */
+           mmsize; /* movu */
+}
+
 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
 {
     uint8_t shuffle[16];
@@ -888,17 +895,14 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
     const int num_lanes = mmsize / 16;
     const int in_total = num_lanes * read_bytes;
     const int out_total = num_lanes * write_bytes;
-    const int read_size = in_total <= 4 ? 4 : /* movd */
-                          in_total <= 8 ? 8 : /* movq */
-                          mmsize; /* movu */
 
     *out = (SwsCompiledOp) {
         .priv = av_memdup(shuffle, sizeof(shuffle)),
         .free = av_free,
         .slice_align = 1,
         .block_size = pixels * num_lanes,
-        .over_read = read_size - in_total,
-        .over_write = mmsize - out_total,
+        .over_read = movsize(in_total, mmsize) - in_total,
+        .over_write = movsize(out_total, mmsize) - out_total,
         .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                      mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                      AV_CPU_FLAG_SSE4,
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index f28c8c640d..15e0918083 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -163,6 +163,16 @@ process_fn 4
 ; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
 ; versions that can handle a larger number of bytes at once.
 
+%macro MOVSIZE 3 ; size, dst, src
+  %if %1 <= 4
+    movd %2, %3
+  %elif %1 <= 8
+    movq %2, %3
+  %else
+    movu %2, %3
+  %endif
+%endmacro
+
 %macro packed_shuffle 2 ; size_in, size_out
 cglobal packed_shuffle%1_%2, 6, 10, 2, \
     exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
@@ -185,15 +195,9 @@ cglobal packed_shuffle%1_%2, 6, 10, 2, \
     sub srcq, srcidxq
     sub dstq, dstidxq
 .loop:
-  %if %1 <= 4
-    movd m0, [srcq + srcidxq]
-  %elif %1 <= 8
-    movq m0, [srcq + srcidxq]
-  %else
-    movu m0, [srcq + srcidxq]
-  %endif
+    MOVSIZE %1, m0, [srcq + srcidxq]
     pshufb m0, m1
-    movu [dstq + dstidxq], m0
+    MOVSIZE %2, [dstq + dstidxq], m0
     add srcidxq, %1
 IF %1 != %2,add dstidxq, %2
     jnz .loop