/**
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/refstruct.h"

#include "ops.h"
#include "ops_internal.h"
#include "ops_dispatch.h"

/**
 * Private state of a single compiled operation pass.
 *
 * `exec_base` describes the main loop over a full line; `exec_tail` describes
 * the separately dispatched tail section that is used when the right-hand
 * edge of a line cannot be processed in place (see op_pass_setup()).
 */
typedef struct SwsOpPass {
    SwsCompiledOp comp;     /* compiled kernel, released in op_pass_free() */
    SwsOpExec exec_base;    /* execution parameters for the main section */
    SwsOpExec exec_tail;    /* execution parameters for the tail section */
    size_t num_blocks;      /* blocks per line, rounded up */
    int tail_off_in;        /* byte offset of the tail within an input line */
    int tail_off_out;       /* byte offset of the tail within an output line */
    int tail_size_in;       /* bytes copied per input line for the tail */
    int tail_size_out;      /* bytes copied per output line for the tail */
    int planes_in;
    int planes_out;
    int pixel_bits_in;
    int pixel_bits_out;
    int idx_in[4];          /* frame plane index per input plane, or -1 */
    int idx_out[4];         /* frame plane index per output plane, or -1 */
    int *offsets_y;         /* refstruct reference; source line per output line */
    int filter_size;
    bool memcpy_first;
    bool memcpy_last;
    bool memcpy_out;
    size_t tail_blocks;     /* number of blocks handled by the tail section */
    uint8_t *tail_buf; /* extra memory for fixing unpadded tails */
    unsigned int tail_buf_size;
} SwsOpPass;

/**
 * Compile an operation list with one specific backend.
 *
 * The list is duplicated before being handed to the backend, so `ops` itself
 * is left untouched. On success the result is stored in `*out`; on failure
 * `*out` is not written and the error is logged (at trace level for
 * AVERROR(ENOTSUP), at error level otherwise).
 *
 * @return the backend's return value, or AVERROR(ENOMEM) if the list could
 *         not be duplicated.
 */
int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
                               const SwsOpList *ops, SwsCompiledOp *out)
{
    SwsOpList *copy;
    SwsCompiledOp compiled = {0};
    int ret = 0;

    copy = ff_sws_op_list_duplicate(ops);
    if (!copy)
        return AVERROR(ENOMEM);

    /* Ensure these are always set during compilation */
    ff_sws_op_list_update_comps(copy);

    ret = backend->compile(ctx, copy, &compiled);
    if (ret < 0) {
        int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
        av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
               backend->name, av_err2str(ret));
    } else {
        *out = compiled;
    }

    /* The working copy is only needed for the duration of the compilation */
    ff_sws_op_list_free(&copy);
    return ret;
}

/**
 * Compile an operation list with the first backend that accepts it.
 *
 * Backends are tried in the order of the NULL-terminated
 * `ff_sws_op_backends` table; backends whose `hw_format` differs from the
 * source or destination `hw_format` are skipped.
 *
 * @return 0 on success, AVERROR(ENOTSUP) if no backend succeeded.
 */
int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
{
    for (int n = 0; ff_sws_op_backends[n]; n++) {
        const SwsOpBackend *backend = ff_sws_op_backends[n];
        if (ops->src.hw_format != backend->hw_format ||
            ops->dst.hw_format != backend->hw_format)
            continue;
        if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
            continue;

        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
               "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
               backend->name, out->block_size, out->over_read, out->over_write,
               out->cpu_flags);

        ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops);
        return 0;
    }

    return AVERROR(ENOTSUP);
}

/**
 * Release a compiled operation: invoke its `free` callback on `priv` (if a
 * callback is set) and reset the whole structure to zero.
 */
void ff_sws_compiled_op_unref(SwsCompiledOp *comp)
{
    if (comp->free)
        comp->free(comp->priv);

    *comp = (SwsCompiledOp) {0};
}

/**
 * Destructor for a SwsOpPass; accepts NULL. Also used as the error-path
 * cleanup in compile(), so it must cope with partially initialized state.
 */
static void op_pass_free(void *ptr)
{
    SwsOpPass *p = ptr;
    if (!p)
        return;

    ff_sws_compiled_op_unref(&p->comp);
    av_refstruct_unref(&p->offsets_y);
    av_free(p->exec_base.in_bump_y);
    av_free(p->exec_base.in_offset_x);
    av_free(p->tail_buf);
    av_free(p);
}

/**
 * Compute the per-plane row pointers for output line `y_dst`.
 *
 * The input line is looked up through `p->offsets_y` when that table is
 * present, and equals `y_dst` otherwise. Both line numbers are shifted by the
 * per-plane vertical subsampling before being multiplied by the stride.
 */
static inline void get_row_data(const SwsOpPass *p, const int y_dst,
                                const uint8_t *in[4], uint8_t *out[4])
{
    const SwsOpExec *base = &p->exec_base;
    const int y_src = p->offsets_y ? p->offsets_y[y_dst] : y_dst;
    for (int i = 0; i < p->planes_in; i++)
        in[i] = base->in[i] + (y_src >> base->in_sub_y[i]) * base->in_stride[i];
    for (int i = 0; i < p->planes_out; i++)
        out[i] = base->out[i] + (y_dst >> base->out_sub_y[i]) * base->out_stride[i];
}

/**
 * Convert a pixel count into a byte count, given the number of bits per
 * pixel. AV_ROUND_ZERO / AV_ROUND_DOWN truncate, AV_ROUND_INF / AV_ROUND_UP
 * round up to the next whole byte; any other mode is treated as unreachable.
 */
static inline size_t pixel_bytes(size_t pixels, int pixel_bits,
                                 enum AVRounding rounding)
{
    const uint64_t bits = (uint64_t) pixels * pixel_bits;
    switch (rounding) {
    case AV_ROUND_ZERO:
    case AV_ROUND_DOWN:
        return bits >> 3;
    case AV_ROUND_INF:
    case AV_ROUND_UP:
        return (bits + 7) >> 3;
    default:
        av_unreachable("Invalid rounding mode");
        return (size_t) -1;
    }
}

/**
 * Number of bytes of a line that may be accessed such that an additional
 * `plane_pad` bytes still fit inside |linesize|; clamped to zero.
 */
static size_t safe_bytes_pad(int linesize, int plane_pad)
{
    av_assert1(linesize);
    int64_t safe_bytes = FFABS((int64_t) linesize) - plane_pad;
    return FFMAX(safe_bytes, 0);
}

/**
 * Largest block count (<= num_blocks) for which the byte offset of the last
 * pixel of the last block does not exceed `safe_offset`.
 */
static size_t safe_blocks_offset(size_t num_blocks, unsigned block_size,
                                 ptrdiff_t safe_offset,
                                 const int32_t *offset_bytes)
{
    size_t safe_blocks = num_blocks;
    while (safe_blocks && offset_bytes[safe_blocks * block_size - 1] > safe_offset)
        safe_blocks--;
    return safe_blocks;
}

/**
 * Per-frame setup callback of an operation pass.
 *
 * Fills in the plane pointers, strides and bumps of `exec_base` from the
 * given frames, and determines how many blocks per line can be processed in
 * place given the kernel's over-read / over-write requirements. If some
 * blocks cannot, the tail section (`exec_tail`, `tail_buf` and the tail
 * offsets/sizes) is prepared as well.
 *
 * @return 0 on success, AVERROR(EINVAL) if the aligned width overflows,
 *         AVERROR(ENOMEM) if the tail buffer cannot be allocated.
 */
static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
                         const SwsPass *pass)
{
    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);

    SwsOpPass *p = pass->priv;
    SwsOpExec *exec = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;

    /* Set up main loop parameters */
    const unsigned block_size = comp->block_size;
    const size_t num_blocks   = (pass->width + block_size - 1) / block_size;
    const size_t aligned_w    = num_blocks * block_size;
    if (aligned_w < pass->width) /* overflow */
        return AVERROR(EINVAL);
    p->num_blocks   = num_blocks;
    p->memcpy_first = false;
    p->memcpy_last  = false;
    p->memcpy_out   = false;

    size_t safe_blocks = num_blocks;
    for (int i = 0; i < p->planes_in; i++) {
        int idx = p->idx_in[i];
        int chroma = idx == 1 || idx == 2;
        int sub_x = chroma ? indesc->log2_chroma_w : 0;
        int sub_y = chroma ? indesc->log2_chroma_h : 0;
        size_t safe_bytes = safe_bytes_pad(in->linesize[idx], comp->over_read);
        size_t safe_blocks_in;
        if (exec->in_offset_x) {
            /* With an offset map, the furthest byte touched by a pixel is its
             * mapped offset plus the size of the filter window */
            size_t filter_size = pixel_bytes(p->filter_size, p->pixel_bits_in,
                                             AV_ROUND_UP);
            safe_blocks_in = safe_blocks_offset(num_blocks, block_size,
                                                safe_bytes - filter_size,
                                                exec->in_offset_x);
        } else {
            safe_blocks_in = safe_bytes / exec->block_size_in;
        }

        if (safe_blocks_in < num_blocks) {
            /* Negative linesize: the first line must be copied; positive
             * linesize: the last line must be copied (see op_pass_run()) */
            p->memcpy_first |= in->linesize[idx] < 0;
            p->memcpy_last  |= in->linesize[idx] > 0;
            safe_blocks = FFMIN(safe_blocks, safe_blocks_in);
        }

        size_t loop_size = num_blocks * exec->block_size_in;
        exec->in[i]        = in->data[idx];
        exec->in_stride[i] = in->linesize[idx];
        exec->in_bump[i]   = in->linesize[idx] - loop_size;
        exec->in_sub_y[i]  = sub_y;
        exec->in_sub_x[i]  = sub_x;
    }

    for (int i = 0; i < p->planes_out; i++) {
        int idx = p->idx_out[i];
        int chroma = idx == 1 || idx == 2;
        int sub_x = chroma ? outdesc->log2_chroma_w : 0;
        int sub_y = chroma ? outdesc->log2_chroma_h : 0;
        size_t safe_bytes = safe_bytes_pad(out->linesize[idx], comp->over_write);
        size_t safe_blocks_out = safe_bytes / exec->block_size_out;
        if (safe_blocks_out < num_blocks) {
            p->memcpy_out = true;
            safe_blocks = FFMIN(safe_blocks, safe_blocks_out);
        }

        size_t loop_size = num_blocks * exec->block_size_out;
        exec->out[i]        = out->data[idx];
        exec->out_stride[i] = out->linesize[idx];
        exec->out_bump[i]   = out->linesize[idx] - loop_size;
        exec->out_sub_y[i]  = sub_y;
        exec->out_sub_x[i]  = sub_x;
    }

    const bool memcpy_in = p->memcpy_first || p->memcpy_last;
    if (!memcpy_in && !p->memcpy_out) {
        av_assert0(safe_blocks == num_blocks);
        return 0;
    }

    /* Set-up tail section parameters and buffers */
    SwsOpExec *tail = &p->exec_tail;
    const int align = av_cpu_max_align();
    size_t alloc_size = 0;
    *tail = *exec;

    const size_t safe_width = safe_blocks * block_size;
    const size_t tail_size  = pass->width - safe_width;
    p->tail_off_out  = pixel_bytes(safe_width, p->pixel_bits_out, AV_ROUND_DOWN);
    p->tail_size_out = pixel_bytes(tail_size,  p->pixel_bits_out, AV_ROUND_UP);
    p->tail_blocks   = num_blocks - safe_blocks;

    if (exec->in_offset_x) {
        p->tail_off_in   = exec->in_offset_x[safe_width];
        p->tail_size_in  = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
        p->tail_size_in += pixel_bytes(p->filter_size, p->pixel_bits_in, AV_ROUND_UP);
    } else {
        p->tail_off_in   = pixel_bytes(safe_width, p->pixel_bits_in, AV_ROUND_DOWN);
        p->tail_size_in  = pixel_bytes(tail_size,  p->pixel_bits_in, AV_ROUND_UP);
    }

    const size_t alloc_width = aligned_w - safe_width;
    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
        size_t needed_size;
        if (exec->in_offset_x) {
            /* The input offset map is already padded to multiples of the block
             * size, and clamps the input offsets to the image boundaries; so
             * we just need to compensate for the comp->over_read */
            needed_size = p->tail_size_in;
        } else {
            needed_size = pixel_bytes(alloc_width, p->pixel_bits_in, AV_ROUND_UP);
        }
        size_t loop_size = p->tail_blocks * exec->block_size_in;
        tail->in_stride[i] = FFALIGN(needed_size + comp->over_read, align);
        tail->in_bump[i]   = tail->in_stride[i] - loop_size;
        alloc_size += tail->in_stride[i] * in->height;
    }

    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
        size_t needed_size = pixel_bytes(alloc_width, p->pixel_bits_out, AV_ROUND_UP);
        size_t loop_size = p->tail_blocks * exec->block_size_out;
        tail->out_stride[i] = FFALIGN(needed_size + comp->over_write, align);
        tail->out_bump[i]   = tail->out_stride[i] - loop_size;
        alloc_size += tail->out_stride[i] * out->height;
    }

    if (memcpy_in && exec->in_offset_x) {
        /* `in_offset_x` is indexed relative to the line start, not the start
         * of the section being processed; so we need to over-allocate this
         * array to the full width of the image, even though we will only
         * partially fill in the offsets relevant to the tail region */
        alloc_size += aligned_w * sizeof(*exec->in_offset_x);
    }

    av_fast_mallocz(&p->tail_buf, &p->tail_buf_size, alloc_size);
    if (!p->tail_buf)
        return AVERROR(ENOMEM);

    /* Carve the single allocation up in the same order it was sized above */
    uint8_t *tail_buf = p->tail_buf;
    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
        tail->in[i] = tail_buf;
        tail_buf += tail->in_stride[i] * in->height;
    }

    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
        tail->out[i] = tail_buf;
        tail_buf += tail->out_stride[i] * out->height;
    }

    if (memcpy_in && exec->in_offset_x) {
        /* Rebase the tail offsets so that they are relative to the start of
         * the copied tail data rather than the start of the source line */
        tail->in_offset_x = (int32_t *) tail_buf;
        for (int i = safe_width; i < aligned_w; i++)
            tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
    }

    return 0;
}

/**
 * Copy `h` lines of `bytes` bytes each from `src` to `dst`, advancing each
 * pointer by its own stride after every line.
 */
static void copy_lines(uint8_t *dst, const size_t dst_stride,
                       const uint8_t *src, const size_t src_stride,
                       const int h, const size_t bytes)
{
    for (int y = 0; y < h; y++) {
        memcpy(dst, src, bytes);
        dst += dst_stride;
        src += src_stride;
    }
}

/**
 * Run callback of an operation pass: process lines [y, y + h).
 *
 * Uses a single kernel dispatch when neither input nor output needs to be
 * bounced through the tail buffer; otherwise dispatches the main section and
 * the tail section separately.
 */
static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
                        const int h, const SwsPass *pass)
{
    const SwsOpPass *p = pass->priv;
    const SwsCompiledOp *comp = &p->comp;

    /* Fill exec metadata for this slice */
    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
    exec.slice_y = y;
    exec.slice_h = h;

    /**
     * To ensure safety, we need to consider the following:
     *
     * 1. We can overread the input, unless this is the last line of an
     *    unpadded buffer. All defined operations can handle arbitrary pixel
     *    input, so overread of arbitrary data is fine. For flipped images,
     *    this condition is actually *inverted* to where the first line is
     *    the one at the end of the buffer.
     *
     * 2. We can overwrite the output, as long as we don't write more than the
     *    amount of pixels that fit into one linesize. So we always need to
     *    memcpy the last column on the output side if unpadded.
     */
    const bool memcpy_in  = (p->memcpy_last && y + h == pass->height) ||
                            (p->memcpy_first && y == 0);
    const bool memcpy_out = p->memcpy_out;
    const size_t num_blocks  = p->num_blocks;
    const size_t tail_blocks = p->tail_blocks;

    get_row_data(p, y, exec.in, exec.out);
    if (!memcpy_in && !memcpy_out) {
        /* Fast path (fully aligned/padded inputs and outputs) */
        comp->func(&exec, comp->priv, 0, y, num_blocks, y + h);
        return;
    }

    /* Non-aligned case (slow path); process main blocks as normal, and
     * a separate tail (via memcpy into an appropriately padded buffer) */
    if (num_blocks > tail_blocks) {
        for (int i = 0; i < 4; i++) {
            /* We process fewer blocks, so the in_bump needs to be increased
             * to reflect that the plane pointers are left on the last block,
             * not the end of the processed line, after each loop iteration */
            exec.in_bump[i]  += exec.block_size_in  * tail_blocks;
            exec.out_bump[i] += exec.block_size_out * tail_blocks;
        }

        comp->func(&exec, comp->priv, 0, y, num_blocks - tail_blocks, y + h);
    }

    DECLARE_ALIGNED_32(SwsOpExec, tail) = p->exec_tail;
    tail.slice_y = y;
    tail.slice_h = h;

    for (int i = 0; i < p->planes_in; i++) {
        /* Input offsets are relative to the base pointer */
        if (!exec.in_offset_x || memcpy_in)
            exec.in[i] += p->tail_off_in;
        tail.in[i] += y * tail.in_stride[i];
    }
    for (int i = 0; i < p->planes_out; i++) {
        exec.out[i]  += p->tail_off_out;
        tail.out[i]  += y * tail.out_stride[i];
    }

    if (!memcpy_in) {
        /* op_pass_setup() rebases exec_tail's offset map relative to
         * tail_off_in whenever any slice may need its input copied. This
         * slice reads the source directly through un-advanced plane pointers,
         * so it must use the line-relative base offsets instead. This is a
         * no-op when the map was never rebased or is absent. */
        tail.in_offset_x = exec.in_offset_x;
    }

    for (int i = 0; i < p->planes_in; i++) {
        if (memcpy_in) {
            copy_lines((uint8_t *) tail.in[i], tail.in_stride[i],
                       exec.in[i], exec.in_stride[i], h, p->tail_size_in);
        } else {
            /* Reuse input pointers directly */
            const size_t loop_size = tail_blocks * exec.block_size_in;
            tail.in[i]        = exec.in[i];
            tail.in_stride[i] = exec.in_stride[i];
            tail.in_bump[i]   = exec.in_stride[i] - loop_size;
        }
    }

    for (int i = 0; !memcpy_out && i < p->planes_out; i++) {
        /* Reuse output pointers directly */
        const size_t loop_size = tail_blocks * exec.block_size_out;
        tail.out[i]        = exec.out[i];
        tail.out_stride[i] = exec.out_stride[i];
        tail.out_bump[i]   = exec.out_stride[i] - loop_size;
    }

    /* Dispatch kernel over tail */
    av_assert1(tail_blocks > 0);
    comp->func(&tail, comp->priv, num_blocks - tail_blocks, y, num_blocks, y + h);

    for (int i = 0; memcpy_out && i < p->planes_out; i++) {
        copy_lines(exec.out[i], exec.out_stride[i],
                   tail.out[i], tail.out_stride[i], h, p->tail_size_out);
    }
}

/** Number of planes accessed by a read/write op: one if packed, else one per element. */
static int rw_planes(const SwsOp *op)
{
    return op->rw.packed ? 1 : op->rw.elems;
}

/**
 * Number of bits one pixel occupies within a single plane of a read/write op:
 * (elements per plane) * (pixel type size) * (8 >> frac).
 */
static int rw_pixel_bits(const SwsOp *op)
{
    const int elems = op->rw.packed ? op->rw.elems : 1;
    const int size  = ff_sws_pixel_type_size(op->type);
    const int bits  = 8 >> op->rw.frac;
    av_assert1(bits >= 1);
    return elems * size * bits;
}

/**
 * Raise the width alignment and padding requested from the output buffer of
 * `pass` so that it satisfies the given block size and over-read/over-write
 * byte count. Does nothing if `pass` is NULL.
 */
static void align_pass(SwsPass *pass, int block_size, int over_rw, int pixel_bits)
{
    if (!pass)
        return;

    /* Add at least as many pixels as needed to cover the padding requirement */
    const int pad = (over_rw * 8 + pixel_bits - 1) / pixel_bits;

    SwsPassBuffer *buf = pass->output;
    buf->width_align = FFMAX(buf->width_align, block_size);
    buf->width_pad   = FFMAX(buf->width_pad, pad);
}

/**
 * Compile `ops` into a single pass and append it to `graph`.
 *
 * @return 0 on success, a negative error code otherwise. Errors from
 *         ff_sws_ops_compile() (including AVERROR(ENOTSUP)) are passed through.
 */
static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
                   SwsPass **output)
{
    SwsContext *ctx = graph->ctx;
    SwsOpPass *p = av_mallocz(sizeof(*p));
    if (!p)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
    if (ret < 0)
        goto fail;

    const SwsCompiledOp *comp = &p->comp;
    const SwsFormat *dst = &ops->dst;
    if (p->comp.opaque) {
        /* Opaque implementations are registered with the graph directly; the
         * SwsOpPass wrapper is not needed, only the compiled op's own
         * function, private data and destructor are passed on */
        SwsCompiledOp c = *comp;
        av_free(p);
        return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                     input, c.slice_align, c.func_opaque,
                                     NULL, c.priv, c.free, output);
    }

    const SwsOp *read  = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    p->planes_in      = rw_planes(read);
    p->planes_out     = rw_planes(write);
    p->pixel_bits_in  = rw_pixel_bits(read);
    p->pixel_bits_out = rw_pixel_bits(write);
    p->exec_base = (SwsOpExec) {
        .width  = dst->width,
        .height = dst->height,
    };

    const int64_t block_bits_in  = (int64_t) comp->block_size * p->pixel_bits_in;
    const int64_t block_bits_out = (int64_t) comp->block_size * p->pixel_bits_out;
    if (block_bits_in & 0x7 || block_bits_out & 0x7) {
        av_log(ctx, AV_LOG_ERROR, "Block size must be a multiple of the pixel size.\n");
        ret = AVERROR(EINVAL);
        goto fail;
    }

    p->exec_base.block_size_in  = block_bits_in  >> 3;
    p->exec_base.block_size_out = block_bits_out >> 3;

    for (int i = 0; i < 4; i++) {
        p->idx_in[i]  = i < p->planes_in  ? ops->plane_src[i] : -1;
        p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1;
    }

    const SwsFilterWeights *filter = read->rw.kernel;
    if (read->rw.filter == SWS_OP_FILTER_V) {
        p->offsets_y = av_refstruct_ref(filter->offsets);

        /* Compute relative pointer bumps for each output line */
        int32_t *bump = av_malloc_array(filter->dst_size, sizeof(*bump));
        if (!bump) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        int line = filter->offsets[0];
        for (int y = 0; y < filter->dst_size - 1; y++) {
            int next = filter->offsets[y + 1];
            bump[y] = next - line - 1;
            line = next;
        }
        bump[filter->dst_size - 1] = 0;
        p->exec_base.in_bump_y = bump;
    } else if (read->rw.filter == SWS_OP_FILTER_H) {
        /* Compute pixel offset map for each output line */
        const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
        int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
        if (!offset) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        /* Hand ownership to `p` immediately, so that the error path below
         * releases the array through op_pass_free() instead of leaking it */
        p->exec_base.in_offset_x = offset;

        for (int x = 0; x < filter->dst_size; x++) {
            /* Sanity check; if the tap would land on a half-pixel, we cannot
             * reasonably expect the implementation to know about this. Just
             * error out in such (theoretical) cases. */
            int64_t bits = (int64_t) filter->offsets[x] * p->pixel_bits_in;
            if ((bits & 0x7) || (bits >> 3) > INT32_MAX) {
                ret = AVERROR(EINVAL);
                goto fail;
            }
            offset[x] = bits >> 3;
        }
        /* Pad the map up to a whole number of blocks by repeating the last
         * valid offset */
        for (int x = filter->dst_size; x < pixels; x++)
            offset[x] = offset[filter->dst_size - 1];
        p->exec_base.block_size_in = 0; /* ptr does not advance */
        p->filter_size = filter->filter_size;
    }

    ret = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                input, comp->slice_align, op_pass_run,
                                op_pass_setup, p, op_pass_free, output);
    if (ret < 0)
        return ret;

    align_pass(input,   comp->block_size, comp->over_read,  p->pixel_bits_in);
    align_pass(*output, comp->block_size, comp->over_write, p->pixel_bits_out);
    return 0;

fail:
    op_pass_free(p);
    return ret;
}

/**
 * Compile an operation list into one or more passes appended to `graph`.
 *
 * The list `*pops` is always consumed: it is freed and `*pops` is set to NULL
 * regardless of the outcome. If the list is an end-to-end no-op, `*output` is
 * set to `input` and no pass is added. If compiling as a single pass fails
 * with AVERROR(ENOTSUP), the list is split into sub-passes which are compiled
 * one after another. On any failure, the graph is rolled back to the number
 * of passes it had on entry.
 *
 * @param flags  if SWS_OP_FLAG_OPTIMIZE is set, the list is optimized first
 * @return 0 on success, a negative error code otherwise
 */
int ff_sws_compile_pass(SwsGraph *graph, SwsOpList **pops, int flags,
                        SwsPass *input, SwsPass **output)
{
    const int passes_orig = graph->num_passes;
    SwsContext *ctx = graph->ctx;
    SwsOpList *ops = *pops;
    int ret = 0;

    /* Check if the whole operation graph is an end-to-end no-op */
    if (ff_sws_op_list_is_noop(ops)) {
        *output = input;
        goto out;
    }

    const SwsOp *read  = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    if (!read || !write) {
        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
               "and write, respectively.\n");
        ret = AVERROR(EINVAL);
        goto out;
    }

    if (flags & SWS_OP_FLAG_OPTIMIZE) {
        ret = ff_sws_op_list_optimize(ops);
        if (ret < 0)
            goto out;
        av_log(ctx, AV_LOG_DEBUG, "Operation list after optimizing:\n");
        ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
    }

    ret = compile(graph, ops, input, output);
    if (ret != AVERROR(ENOTSUP))
        goto out;

    av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n");
    SwsPass *prev = input;
    while (ops) {
        SwsOpList *rest;
        ret = ff_sws_op_list_subpass(ops, &rest);
        if (ret < 0)
            goto out;

        if (prev == input && !rest) {
            /* No point in compiling an unsplit pass again */
            ret = AVERROR(ENOTSUP);
            goto out;
        }

        ret = compile(graph, ops, prev, &prev);
        if (ret < 0) {
            ff_sws_op_list_free(&rest);
            goto out;
        }

        ff_sws_op_list_free(&ops);
        ops = rest;
    }

    /* Return last subpass successfully compiled */
    av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n",
           graph->num_passes - passes_orig);
    *output = prev;

out:
    if (ret == AVERROR(ENOTSUP)) {
        av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
        ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
    }
    if (ret < 0)
        ff_sws_graph_rollback(graph, passes_orig);
    ff_sws_op_list_free(&ops);
    *pops = NULL;
    return ret;
}