mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-04-18 16:40:23 +00:00
swscale/x86/ops_int: use sized mov for packed_shuffle output
This code made the input read conditional on the byte count, but not the output write, leading to a lot of over-write for cases like (size_in, size_out) = (15, 5).

Signed-off-by: Niklas Haas <git@haasn.dev>
This commit is contained in:
parent
4264045137
commit
ba516a34cd
2 changed files with 21 additions and 13 deletions
|
|
@ -869,6 +869,13 @@ static bool op_is_type_invariant(const SwsOp *op)
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Size in bytes of the move used to transfer `bytes` bytes.
 *
 * @param bytes   number of bytes that have to be covered by the move
 * @param mmsize  size returned when neither of the two narrow moves fits
 * @return 4 (movd) when bytes <= 4, 8 (movq) when bytes <= 8,
 *         and mmsize (movu) otherwise
 */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes > 8)
        return mmsize; /* movu */
    if (bytes > 4)
        return 8;      /* movq */
    return 4;          /* movd */
}
|
||||
|
||||
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
|
||||
{
|
||||
uint8_t shuffle[16];
|
||||
|
|
@ -888,17 +895,14 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
|
|||
const int num_lanes = mmsize / 16;
|
||||
const int in_total = num_lanes * read_bytes;
|
||||
const int out_total = num_lanes * write_bytes;
|
||||
const int read_size = in_total <= 4 ? 4 : /* movd */
|
||||
in_total <= 8 ? 8 : /* movq */
|
||||
mmsize; /* movu */
|
||||
|
||||
*out = (SwsCompiledOp) {
|
||||
.priv = av_memdup(shuffle, sizeof(shuffle)),
|
||||
.free = av_free,
|
||||
.slice_align = 1,
|
||||
.block_size = pixels * num_lanes,
|
||||
.over_read = read_size - in_total,
|
||||
.over_write = mmsize - out_total,
|
||||
.over_read = movsize(in_total, mmsize) - in_total,
|
||||
.over_write = movsize(out_total, mmsize) - out_total,
|
||||
.cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
|
||||
mmsize > 16 ? AV_CPU_FLAG_AVX2 :
|
||||
AV_CPU_FLAG_SSE4,
|
||||
|
|
|
|||
|
|
@ -163,6 +163,16 @@ process_fn 4
|
|||
; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
|
||||
; versions that can handle a larger number of bytes at once.
|
||||
|
||||
; MOVSIZE size, dst, src
; Emits a single move from src to dst, picking the instruction by the
; assemble-time value of `size`:
;   size <= 4 -> movd
;   size <= 8 -> movq
;   otherwise -> movu
%macro MOVSIZE 3 ; size, dst, src
%if %1 > 8
    movu %2, %3
%elif %1 > 4
    movq %2, %3
%else
    movd %2, %3
%endif
%endmacro
|
||||
|
||||
%macro packed_shuffle 2 ; size_in, size_out
|
||||
cglobal packed_shuffle%1_%2, 6, 10, 2, \
|
||||
exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
|
||||
|
|
@ -185,15 +195,9 @@ cglobal packed_shuffle%1_%2, 6, 10, 2, \
|
|||
sub srcq, srcidxq
|
||||
sub dstq, dstidxq
|
||||
.loop:
|
||||
%if %1 <= 4
|
||||
movd m0, [srcq + srcidxq]
|
||||
%elif %1 <= 8
|
||||
movq m0, [srcq + srcidxq]
|
||||
%else
|
||||
movu m0, [srcq + srcidxq]
|
||||
%endif
|
||||
MOVSIZE %1, m0, [srcq + srcidxq]
|
||||
pshufb m0, m1
|
||||
movu [dstq + dstidxq], m0
|
||||
MOVSIZE %2, [dstq + dstidxq], m0
|
||||
add srcidxq, %1
|
||||
IF %1 != %2,add dstidxq, %2
|
||||
jnz .loop
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue