swscale/x86/ops_int: use sized mov for packed_shuffle output

This code made the size of the input read conditional on the byte count, but
not the size of the output write, leading to a lot of over-write for cases
like 15, 5 (i.e. 15 bytes in, 5 bytes out).

Signed-off-by: Niklas Haas <git@haasn.dev>
This commit is contained in:
Niklas Haas 2026-04-12 18:16:39 +02:00 committed by Niklas Haas
parent 4264045137
commit ba516a34cd
2 changed files with 21 additions and 13 deletions

View file

@ -869,6 +869,13 @@ static bool op_is_type_invariant(const SwsOp *op)
return false;
}
/**
 * Select the width, in bytes, of the move instruction used to transfer
 * a given number of payload bytes.
 *
 * @param bytes  number of bytes that actually need to be transferred
 * @param mmsize width in bytes of the full-register move (movu)
 * @return 4 if a movd covers the payload, 8 if a movq covers it,
 *         otherwise mmsize
 */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes > 8)
        return mmsize; /* movu */
    if (bytes > 4)
        return 8;      /* movq */
    return 4;          /* movd */
}
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
{
uint8_t shuffle[16];
@ -888,17 +895,14 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
const int num_lanes = mmsize / 16;
const int in_total = num_lanes * read_bytes;
const int out_total = num_lanes * write_bytes;
const int read_size = in_total <= 4 ? 4 : /* movd */
in_total <= 8 ? 8 : /* movq */
mmsize; /* movu */
*out = (SwsCompiledOp) {
.priv = av_memdup(shuffle, sizeof(shuffle)),
.free = av_free,
.slice_align = 1,
.block_size = pixels * num_lanes,
.over_read = read_size - in_total,
.over_write = mmsize - out_total,
.over_read = movsize(in_total, mmsize) - in_total,
.over_write = movsize(out_total, mmsize) - out_total,
.cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
mmsize > 16 ? AV_CPU_FLAG_AVX2 :
AV_CPU_FLAG_SSE4,

View file

@ -163,6 +163,16 @@ process_fn 4
; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
; versions that can handle a larger number of bytes at once.
; Emit a single move from %3 to %2, picking the narrowest of
; movd / movq / movu whose threshold still covers %1 bytes.
%macro MOVSIZE 3 ; size, dst, src
%if %1 > 8
    ; more than 8 bytes
    movu %2, %3
%elif %1 > 4
    ; 5 to 8 bytes
    movq %2, %3
%else
    ; at most 4 bytes
    movd %2, %3
%endif
%endmacro
%macro packed_shuffle 2 ; size_in, size_out
cglobal packed_shuffle%1_%2, 6, 10, 2, \
exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
@ -185,15 +195,9 @@ cglobal packed_shuffle%1_%2, 6, 10, 2, \
sub srcq, srcidxq
sub dstq, dstidxq
.loop:
%if %1 <= 4
movd m0, [srcq + srcidxq]
%elif %1 <= 8
movq m0, [srcq + srcidxq]
%else
movu m0, [srcq + srcidxq]
%endif
MOVSIZE %1, m0, [srcq + srcidxq]
pshufb m0, m1
movu [dstq + dstidxq], m0
MOVSIZE %2, [dstq + dstidxq], m0
add srcidxq, %1
IF %1 != %2,add dstidxq, %2
jnz .loop