From d00f41f21347a93cbcf533dceccd7532bd925067 Mon Sep 17 00:00:00 2001 From: averne Date: Sat, 8 Nov 2025 19:57:37 +0100 Subject: [PATCH] vulkan/prores: forward quantization parameter to the IDCT shader The qScale syntax element has a maximum value of 512, which would overflow the 16-bit store from the VLD shader in extreme cases. This fixes that edge case by forwarding the element in a storage buffer, and applying the inverse quantization fully in the IDCT shader. --- libavcodec/vulkan/prores_idct.comp | 13 +++- libavcodec/vulkan/prores_vld.comp | 34 +++++---- libavcodec/vulkan_prores.c | 115 ++++++++++++++++++++++------- 3 files changed, 114 insertions(+), 48 deletions(-) diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp index 645cb02979..4b39b3d8ae 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/prores_idct.comp @@ -87,7 +87,7 @@ void main(void) uint chroma_shift = comp != 0 ? log2_chroma_w : 0; bool act = gid.x < mb_width << (4 - chroma_shift); - /* Coalesced load of DCT coeffs in shared memory, second part of inverse quantization */ + /* Coalesced load of DCT coeffs in shared memory, inverse quantization */ if (act) { /** * According to spec indexing an array in push constant memory with @@ -95,9 +95,14 @@ void main(void) * so copy the whole matrix locally. */ uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma; + + /* Table 15 */ + uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> 4)]; + int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; + [[unroll]] for (uint i = 0; i < 8; ++i) { - int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) | i))), 16); - blocks[block][i * 9 + idx] = float(v * int(qmat[(i << 3) + idx])); + int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16); + blocks[block][i * 9 + idx] = float(v * qscale * int(qmat[(i << 3) + idx])); } } @@ -117,7 +122,7 @@ void main(void) if (act) { [[unroll]] for (uint i = 0; i < 8; ++i) { float v = blocks[block][i * 9 + idx] * fact + off; - put_px(comp, ivec2(gid.x, (gid.y << 3) | i), clamp(int(v), 0, maxv)); + put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv)); } } } diff --git a/libavcodec/vulkan/prores_vld.comp b/libavcodec/vulkan/prores_vld.comp index 00e78e08ff..298a5baf4c 100644 --- a/libavcodec/vulkan/prores_vld.comp +++ b/libavcodec/vulkan/prores_vld.comp @@ -22,9 +22,9 @@ void put_px(uint tex_idx, ivec2 pos, uint v) { #ifndef INTERLACED - imageStore(dst[tex_idx], pos, uvec4(v)); + imageStore(dst[tex_idx], pos, uvec4(uint16_t(v))); #else - imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v)); + imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(uint16_t(v))); #endif } @@ -57,7 +57,7 @@ uint decode_codeword(inout GetBitContext gb, int codebook) } } -void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale) +void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count) { uvec3 gid = gl_GlobalInvocationID; uint is_luma = uint(gid.z == 0); @@ -70,7 +70,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale) { /* First coeff */ uint c = to_signed(decode_codeword(gb, 0x650)); - put_px(gid.z, base_pos, c * qscale & 0xffff); + put_px(gid.z, base_pos, c); /** * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8) @@ -89,7 +89,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale) int s = int(prev_dc_diff) >> 31; c += prev_dc_diff = (to_signed(cw) ^ s) - s; - put_px(gid.z, base_pos + pos_to_block(i, is_luma), c * qscale & 0xffff); + put_px(gid.z, base_pos + pos_to_block(i, is_luma), c); } } @@ -152,7 +152,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale) ivec2 bpos = ivec2(scan & 0xf, scan >> 4); uint c = ((level + 1) ^ -s) + s; - put_px(gid.z, base_pos + spos + bpos, c * qscale & 0xffff); + put_px(gid.z, base_pos + spos + bpos, c); } } } @@ -218,7 +218,7 @@ void decode_alpha(in GetBitContext gb, uvec2 mb_pos, uint mb_count) */ uint val = (alpha << alpha_rescale_lshift) | (alpha >> alpha_rescale_rshift); for (uint end = pos + run; pos < end; ++pos) - put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val & 0xffff); + put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val); } } @@ -235,13 +235,8 @@ void main(void) u8buf bs = u8buf(slice_data + slice_off); /* Decode slice header */ - uint hdr_size, y_size, u_size, v_size, a_size; - hdr_size = bs[0].v >> 3; - - /* Table 15 */ - uint qidx = clamp(bs[1].v, 1, 224), - qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; - + uint hdr_size, qidx, y_size, u_size, v_size, a_size; + hdr_size = bs[0].v >> 3, qidx = clamp(bs[1].v, 1, 224); y_size = (uint(bs[2].v) << 8) | bs[3].v; u_size = (uint(bs[4].v) << 8) | bs[5].v; @@ -308,10 +303,17 @@ void main(void) uint mb_count = 1 << log2_width; if (gid.z < 3) { - /* Color entropy decoding, inverse scanning, first part of inverse quantization */ - decode_comp(gb, uvec2(mb_x, mb_y), mb_count, qscale); + /* Color entropy decoding, inverse scanning */ + decode_comp(gb, uvec2(mb_x, mb_y), mb_count); } else { /* Alpha entropy decoding */ decode_alpha(gb, uvec2(mb_x, mb_y), mb_count); } + + /* Forward the quantization index to the IDCT shader */ + if (gid.z == 0) { + uint base = mb_y * mb_width + mb_x; + for (uint i = 0; i < mb_count; ++i) + quant_idx[base + i] = uint8_t(qidx); + } } diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c index 2602be112b..8849e337c3 100644 --- a/libavcodec/vulkan_prores.c +++ b/libavcodec/vulkan_prores.c @@ -37,11 +37,13 @@ const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc = { typedef struct ProresVulkanDecodePicture { FFVulkanDecodePicture vp; - AVBufferRef *slice_offset_buf; - uint32_t slice_num; + AVBufferRef *metadata_buf; uint32_t bitstream_start; uint32_t bitstream_size; + uint32_t slice_num; + + uint32_t slice_offsets_sz, mb_params_sz; } ProresVulkanDecodePicture; typedef struct ProresVulkanDecodeContext { @@ -51,7 +53,7 @@ typedef struct ProresVulkanDecodeContext { FFVulkanShader idct; } shaders[2]; /* Progressive/interlaced */ - AVBufferPool *slice_offset_pool; + AVBufferPool *metadata_pool; } ProresVulkanDecodeContext; typedef struct ProresVkParameters { @@ -88,6 +90,9 @@ static int vk_prores_start_frame(AVCodecContext *avctx, int err; + pp->slice_offsets_sz = (pr->slice_count + 1) * sizeof(uint32_t); + pp->mb_params_sz = pr->mb_width * pr->mb_height * sizeof(uint8_t); + /* Host map the input slices data if supported */ if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) RET(ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data, @@ -96,11 +101,10 @@ static int vk_prores_start_frame(AVCodecContext *avctx, VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT)); /* Allocate slice offsets buffer */ - RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->slice_offset_pool, - &pp->slice_offset_buf, - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, - NULL, (pr->slice_count + 1) * sizeof(uint32_t), + RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->metadata_pool, + &pp->metadata_buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + NULL, pp->slice_offsets_sz + pp->mb_params_sz, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); @@ -123,7 +127,7 @@ static int vk_prores_decode_slice(AVCodecContext *avctx, ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private; FFVulkanDecodePicture *vp = &pp->vp; - FFVkBuffer *slice_offset = (FFVkBuffer *)pp->slice_offset_buf->data; + FFVkBuffer *slice_offset = (FFVkBuffer *)pp->metadata_buf->data; FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL; /* Skip picture header */ @@ -158,7 +162,7 @@ static int vk_prores_end_frame(AVCodecContext *avctx) FFVulkanDecodePicture *vp = &pp->vp; ProresVkParameters pd; - FFVkBuffer *slice_data, *slice_offsets; + FFVkBuffer *slice_data, *metadata; struct ProresVulkanShaderVariants *shaders; VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; VkBufferMemoryBarrier2 buf_bar[2]; @@ -172,8 +176,8 @@ static int vk_prores_end_frame(AVCodecContext *avctx) if (!pix_desc) return AVERROR(EINVAL); - slice_data = (FFVkBuffer *)vp->slices_buf->data; - slice_offsets = (FFVkBuffer *)pp->slice_offset_buf->data; + slice_data = (FFVkBuffer *)vp->slices_buf->data; + metadata = (FFVkBuffer *)pp->metadata_buf->data; shaders = &pv->shaders[pr->frame_type != 0]; @@ -209,13 +213,13 @@ static int vk_prores_end_frame(AVCodecContext *avctx) pr->frame)); RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, - (AVBufferRef *[]){ vp->slices_buf, pp->slice_offset_buf }, + (AVBufferRef *[]){ vp->slices_buf, pp->metadata_buf, }, 2, 0)); /* Transfer ownership to the exec context */ - vp->slices_buf = pp->slice_offset_buf = NULL; + vp->slices_buf = pp->metadata_buf = NULL; - /* Input frame barrier */ + /* Input barrier */ ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, @@ -223,6 +227,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx) VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = metadata->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = metadata->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = metadata->buf, + .offset = pp->slice_offsets_sz, + .size = pp->mb_params_sz, + }; + metadata->stage = buf_bar[0].dstStageMask; + metadata->access = buf_bar[0].dstAccessMask; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, @@ -267,12 +286,17 @@ static int vk_prores_end_frame(AVCodecContext *avctx) /* Entropy decode */ ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld, 0, 0, 0, - slice_offsets, - 0, (pp->slice_num + 1) * sizeof(uint32_t), + metadata, 0, + pp->slice_offsets_sz, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld, + 0, 1, 0, + metadata, pp->slice_offsets_sz, + pp->mb_params_sz, VK_FORMAT_UNDEFINED); ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->vld, pr->frame, vp->view.out, - 0, 1, + 0, 2, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); @@ -286,7 +310,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx) 3 + !!pr->alpha_info); /* Synchronize vld and idct shaders */ - nb_img_bar = 0; ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, @@ -294,6 +317,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx) VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = metadata->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = metadata->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = metadata->buf, + .offset = pp->slice_offsets_sz, + .size = pp->mb_params_sz, + }; + metadata->stage = buf_bar[0].dstStageMask; + metadata->access = buf_bar[0].dstAccessMask; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, @@ -304,9 +342,14 @@ static int vk_prores_end_frame(AVCodecContext *avctx) nb_img_bar = nb_buf_bar = 0; /* Inverse transform */ + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->idct, + 0, 0, 0, + metadata, pp->slice_offsets_sz, + pp->mb_params_sz, + VK_FORMAT_UNDEFINED); ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->idct, pr->frame, vp->view.out, - 0, 0, + 0, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); @@ -406,23 +449,23 @@ static void vk_decode_prores_uninit(FFVulkanDecodeShared *ctx) ff_vk_shader_free(&ctx->s, &pv->shaders[i].idct); } - av_buffer_pool_uninit(&pv->slice_offset_pool); + av_buffer_pool_uninit(&pv->metadata_pool); av_freep(&pv); } static int vk_decode_prores_init(AVCodecContext *avctx) { - FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; - FFVulkanDecodeShared *ctx = NULL; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = NULL; AVHWFramesContext *out_frames_ctx; ProresVulkanDecodeContext *pv; FFVkSPIRVCompiler *spv; FFVulkanDescriptorSetBinding *desc_set; - int max_num_slices, i, err; + int max_num_mbs, i, err; - max_num_slices = (avctx->coded_width >> 4) * (avctx->coded_height >> 4); + max_num_mbs = (avctx->coded_width >> 4) * (avctx->coded_height >> 4); spv = ff_vk_spirv_init(); if (!spv) { @@ -471,7 +514,15 @@ static int vk_decode_prores_init(AVCodecContext *avctx) .stages = VK_SHADER_STAGE_COMPUTE_BIT, .mem_quali = "readonly", .buf_content = "uint32_t slice_offsets", - .buf_elems = max_num_slices + 1, + .buf_elems = max_num_mbs + 1, + }, + { + .name = "quant_idx_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "writeonly", + .buf_content = "uint8_t quant_idx", + .buf_elems = max_num_mbs, }, { .name = "dst", @@ -485,10 +536,18 @@ static int vk_decode_prores_init(AVCodecContext *avctx) }, }; RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->vld, - "prores_dec_vld", "main", desc_set, 2, + "prores_dec_vld", "main", desc_set, 3, ff_source_prores_vld_comp, 0x080801, i)); desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "quant_idx_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "readonly", + .buf_content = "uint8_t quant_idx", + .buf_elems = max_num_mbs, + }, { .name = "dst", .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, @@ -500,7 +559,7 @@ static int vk_decode_prores_init(AVCodecContext *avctx) }, }; RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->idct, - "prores_dec_idct", "main", desc_set, 1, + "prores_dec_idct", "main", desc_set, 2, ff_source_prores_idct_comp, 0x200201, i)); }