mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-04-18 00:20:21 +00:00
This commit pieces together the previous few commits to implement the NEON backend for sws_ops. In essence, a tool which runs on the target (sws_ops_aarch64) is used to enumerate all the functions that the backend needs to implement. The list it generates is stored in the repository (ops_entries.c). The list from above is used at build time by a code generator tool (ops_asmgen) to implement all the sws_ops functions the NEON backend supports, and generate a lookup function in C to retrieve the assembly function pointers. At runtime, the NEON backend fetches the function pointers to the assembly functions and chains them together in a continuation-passing style design, similar to the x86 backend. The following speedup is observed from legacy swscale to NEON: A520: Overall speedup=3.780x faster, min=0.137x max=91.928x A720: Overall speedup=4.129x faster, min=0.234x max=92.424x And the following from the C sws_ops implementation to NEON: A520: Overall speedup=5.513x faster, min=0.927x max=14.169x A720: Overall speedup=4.786x faster, min=0.585x max=20.157x The slowdowns from legacy to NEON are the same for C/x86. Mostly low bit-depth conversions that did not perform dithering in legacy. The 0.585x outlier from C to NEON is gbrpf32le -> gbrapf32le, which is mostly memcpy with the C implementation. All other conversions are better. Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <ramiro.polla@gmail.com>
188 lines
6 KiB
C
188 lines
6 KiB
C
/*
 * Copyright (C) 2026 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#ifndef SWSCALE_AARCH64_OPS_IMPL_H
|
|
#define SWSCALE_AARCH64_OPS_IMPL_H
|
|
|
|
#include <assert.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
/* Similar to SwsPixelType: the component data types supported by the
 * AArch64 backend (unsigned 8/16/32-bit integers and 32-bit floats). */
typedef enum SwsAArch64PixelType {
    AARCH64_PIXEL_U8,
    AARCH64_PIXEL_U16,
    AARCH64_PIXEL_U32,
    AARCH64_PIXEL_F32,
    AARCH64_PIXEL_TYPE_NB, /* number of entries (sentinel, not a valid type) */
} SwsAArch64PixelType;
|
|
|
|
/* Similar to SwsOpType: the kinds of operations the AArch64 backend can
 * chain together. */
typedef enum SwsAArch64OpType {
    AARCH64_SWS_OP_NONE = 0,
    AARCH64_SWS_OP_PROCESS,
    AARCH64_SWS_OP_PROCESS_RETURN,
    /* pixel input operations */
    AARCH64_SWS_OP_READ_BIT,
    AARCH64_SWS_OP_READ_NIBBLE,
    AARCH64_SWS_OP_READ_PACKED,
    AARCH64_SWS_OP_READ_PLANAR,
    /* pixel output operations */
    AARCH64_SWS_OP_WRITE_BIT,
    AARCH64_SWS_OP_WRITE_NIBBLE,
    AARCH64_SWS_OP_WRITE_PACKED,
    AARCH64_SWS_OP_WRITE_PLANAR,
    /* component reordering / repacking */
    AARCH64_SWS_OP_SWAP_BYTES,
    AARCH64_SWS_OP_SWIZZLE,
    AARCH64_SWS_OP_UNPACK,
    AARCH64_SWS_OP_PACK,
    /* per-component arithmetic */
    AARCH64_SWS_OP_LSHIFT,
    AARCH64_SWS_OP_RSHIFT,
    AARCH64_SWS_OP_CLEAR,
    AARCH64_SWS_OP_CONVERT,
    AARCH64_SWS_OP_EXPAND,
    AARCH64_SWS_OP_MIN,
    AARCH64_SWS_OP_MAX,
    AARCH64_SWS_OP_SCALE,
    AARCH64_SWS_OP_LINEAR,
    AARCH64_SWS_OP_DITHER,
    AARCH64_SWS_OP_TYPE_NB, /* number of entries (sentinel) */
} SwsAArch64OpType;
|
|
|
|
/* Each nibble in the mask corresponds to one component (16 bits total,
 * i.e. 4 components x 4 bits). See MASK_GET / MASK_SET below. */
typedef uint16_t SwsAArch64OpMask;

/**
 * Affine coefficient mask for linear op. Packs a 4x5 matrix in execution
 * order, where the offset is the first element, with 2 bits per element:
 *   00: m[i][j] == 0
 *   01: m[i][j] == 1
 *   11: m[i][j] is any other coefficient
 */
typedef uint64_t SwsAArch64LinearOpMask;
|
|
|
|
/* Parameters for AARCH64_SWS_OP_LINEAR. */
typedef struct SwsAArch64LinearOp {
    SwsAArch64LinearOpMask mask; /* which matrix elements are 0 / 1 / arbitrary */
    /* nonzero selects the fused multiply-add variant — presumably FMLA
     * instructions; confirm against the generated assembly */
    uint8_t fmla;
} SwsAArch64LinearOp;
|
|
|
|
/* Parameters for AARCH64_SWS_OP_DITHER. */
typedef struct SwsAArch64DitherOp {
    uint16_t y_offset;  /* row offset into the dither matrix */
    uint8_t size_log2;  /* log2 of the dither matrix dimension */
} SwsAArch64DitherOp;
|
|
|
|
/**
 * SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType
 * operation. It consists of simplified parameters from the SwsOp structure,
 * with the purpose of being straight-forward to implement and execute.
 */
typedef struct SwsAArch64OpImplParams {
    SwsAArch64OpType op;       /* which operation this describes */
    SwsAArch64OpMask mask;     /* active components, one nibble each */
    SwsAArch64PixelType type;  /* component type the op operates on */
    uint8_t block_size;        /* pixels processed per invocation */
    /* op-specific payload; which member is valid depends on `op` */
    union {
        uint8_t shift;               /* LSHIFT / RSHIFT amount */
        SwsAArch64OpMask swizzle;    /* SWIZZLE component permutation */
        SwsAArch64OpMask pack;       /* PACK layout */
        SwsAArch64PixelType to_type; /* CONVERT destination type */
        SwsAArch64LinearOp linear;   /* LINEAR matrix description */
        SwsAArch64DitherOp dither;   /* DITHER parameters */
    };
} SwsAArch64OpImplParams;
|
|
|
|
/* SwsAArch64OpMask-related helpers. */

/* Read the 4-bit field for component idx (0..3). */
#define MASK_GET(mask, idx) (((mask) >> ((idx) << 2)) & 0xf)
/* OR the low 4 bits of val into the field for component idx. Never clears
 * bits, so the target nibble is expected to start out zero. */
#define MASK_SET(mask, idx, val) do { (mask) |= (((val) & 0xf) << ((idx) << 2)); } while (0)

/* Loop header: iterate idx over the components whose nibble is nonzero. */
#define LOOP(mask, idx) \
    for (int idx = 0; idx < 4; idx++) \
        if (MASK_GET(mask, idx))
/* Same as LOOP, but visits components in reverse order. */
#define LOOP_BWD(mask, idx) \
    for (int idx = 3; idx >= 0; idx--) \
        if (MASK_GET(mask, idx))

/* Convenience wrappers taking a pointer with a `mask` member
 * (e.g. SwsAArch64OpImplParams *). */
#define LOOP_MASK(p, idx) LOOP(p->mask, idx)
#define LOOP_MASK_BWD(p, idx) LOOP_BWD(p->mask, idx)

/* 2-bit accessors into an SwsAArch64LinearOpMask: row idx (0..3),
 * column jdx (0..4), packed at bit position 2*(5*idx + jdx). */
#define LINEAR_MASK_GET(mask, idx, jdx) (((mask) >> (2 * ((5 * (idx) + (jdx))))) & 3)
#define LINEAR_MASK_SET(mask, idx, jdx, val) do { \
    (mask) |= ((((SwsAArch64LinearOpMask) (val)) & 3) << (2 * ((5 * (idx) + (jdx))))); \
} while (0)
#define LINEAR_MASK_0 0 /* coefficient is 0 */
#define LINEAR_MASK_1 1 /* coefficient is 1 */
#define LINEAR_MASK_X 3 /* any other coefficient */

/* Loop header: iterate over every nonzero linear coefficient (jdx) of
 * every active component (idx). */
#define LOOP_LINEAR_MASK(p, idx, jdx) \
    LOOP_MASK(p, idx) \
        for (int jdx = 0; jdx < 5; jdx++) \
            if (LINEAR_MASK_GET(p->linear.mask, idx, jdx))
|
|
|
|
/* Compute number of vector registers needed to store all coefficients. */
|
|
static inline int linear_num_vregs(const SwsAArch64OpImplParams *params)
|
|
{
|
|
int count = 0;
|
|
LOOP_LINEAR_MASK(params, i, j)
|
|
count++;
|
|
return (count + 3) / 4;
|
|
}
|
|
|
|
/* Translate an execution-order column index (offset first) into the
 * SwsOp matrix column order (offset last): 0 -> 4, 1..4 -> 0..3. */
static inline int linear_index_to_sws_op(int idx)
{
    return idx ? idx - 1 : 4;
}
|
|
|
|
/* In execution order the affine offset occupies column 0. */
static inline int linear_index_is_offset(int idx)
{
    return !idx;
}
|
|
|
|
/* Map an execution-order column index to its vector-register slot.
 * Column 0 is the offset and shouldn't map to any vx, but to please
 * UBSan we map it to 0; columns 1..4 map to 0..3. */
static inline int linear_index_to_vx(int idx)
{
    return idx ? idx - 1 : 0;
}
|
|
|
|
/**
 * These values will be used by ops_asmgen to access fields inside of
 * SwsOpExec and SwsOpImpl. The sizes are checked below when compiling
 * for AArch64 to make sure there is no mismatch.
 */
#define offsetof_exec_in 0
#define offsetof_exec_out 32
#define offsetof_exec_in_bump 128
#define offsetof_exec_out_bump 160
#define offsetof_impl_cont 0
#define offsetof_impl_priv 16
#define sizeof_impl 32

/* Validate the hard-coded offsets against the real struct layouts; only
 * possible when the AArch64/NEON structs are actually being compiled. */
#if ARCH_AARCH64 && HAVE_NEON
static_assert(offsetof_exec_in == offsetof(SwsOpExec, in), "SwsOpExec layout mismatch");
static_assert(offsetof_exec_out == offsetof(SwsOpExec, out), "SwsOpExec layout mismatch");
static_assert(offsetof_exec_in_bump == offsetof(SwsOpExec, in_bump), "SwsOpExec layout mismatch");
static_assert(offsetof_exec_out_bump == offsetof(SwsOpExec, out_bump), "SwsOpExec layout mismatch");
static_assert(offsetof_impl_cont == offsetof(SwsOpImpl, cont), "SwsOpImpl layout mismatch");
static_assert(offsetof_impl_priv == offsetof(SwsOpImpl, priv), "SwsOpImpl layout mismatch");
#endif

#endif /* SWSCALE_AARCH64_OPS_IMPL_H */
|