swscale/x86/ops_int: use sized mov for packed_shuffle output

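The input load was already sized according to the byte count, but the output store was not: it always used a full-width movu, leading to a lot of over-write for cases like 15, 5 (15 bytes in, 5 bytes out). Use a sized mov for the output as well. Signed-off-by: Niklas Haas <git@haasn.dev>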
2026-04-18 16:40:23 +00:00 · 2026-04-12 18:16:39 +02:00 · 2026-04-12 18:16:39 +02:00 · ba516a34cd
commit ba516a34cd
parent 4264045137
2 changed files with 21 additions and 13 deletions
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@ -869,6 +869,13 @@ static bool op_is_type_invariant(const SwsOp *op)
    return false;
 }

+static int movsize(const int bytes, const int mmsize) /* width in bytes of the narrowest mov that covers `bytes` */
+{ /* same thresholds as the MOVSIZE macro in ops_int.asm; keep the two in sync */
+    return bytes <= 4 ? 4 : /* movd: 4-byte access */
+           bytes <= 8 ? 8 : /* movq: 8-byte access */
+           mmsize;          /* movu: full vector register, mmsize bytes */
+}
+
 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
 {
    uint8_t shuffle[16];
@ -888,17 +895,14 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
    const int num_lanes = mmsize / 16;
    const int in_total  = num_lanes * read_bytes;
    const int out_total = num_lanes * write_bytes;
-    const int read_size = in_total <= 4 ? 4 : /* movd */
-                          in_total <= 8 ? 8 : /* movq */
-                          mmsize;             /* movu */

    *out = (SwsCompiledOp) {
        .priv        = av_memdup(shuffle, sizeof(shuffle)),
        .free        = av_free,
        .slice_align = 1,
        .block_size  = pixels * num_lanes,
-        .over_read   = read_size - in_total,
-        .over_write  = mmsize - out_total,
+        .over_read   = movsize(in_total,  mmsize) - in_total,
+        .over_write  = movsize(out_total, mmsize) - out_total,
        .cpu_flags   = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                       mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                     AV_CPU_FLAG_SSE4,
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@ -163,6 +163,16 @@ process_fn 4
 ; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
 ; versions that can handle a larger number of bytes at once.

+%macro MOVSIZE 3 ; size (bytes), dst, src -- move with the narrowest mov that covers `size`
+    %if %1 <= 4
+        movd %2, %3 ; at most 4 bytes
+    %elif %1 <= 8
+        movq %2, %3 ; at most 8 bytes
+    %else
+        movu %2, %3 ; more than 8 bytes: full-register unaligned move
+    %endif
+%endmacro
+
 %macro packed_shuffle 2 ; size_in, size_out
 cglobal packed_shuffle%1_%2, 6, 10, 2, \
    exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
@ -185,15 +195,9 @@ cglobal packed_shuffle%1_%2, 6, 10, 2, \
            sub srcq, srcidxq
            sub dstq, dstidxq
 .loop:
-    %if %1 <= 4
-            movd m0, [srcq + srcidxq]
-    %elif %1 <= 8
-            movq m0, [srcq + srcidxq]
-    %else
-            movu m0, [srcq + srcidxq]
-    %endif
+            MOVSIZE %1, m0, [srcq + srcidxq]
            pshufb m0, m1
-            movu [dstq + dstidxq], m0
+            MOVSIZE %2, [dstq + dstidxq], m0
            add srcidxq, %1
 IF %1 != %2,add dstidxq, %2
            jnz .loop