vulkan/prores: forward quantization parameter to the IDCT shader

The qScale value derived from the slice's quantization index can be as large as 512, so the product of a decoded coefficient and qScale could overflow the 16-bit store performed by the VLD shader in extreme cases.
This fixes that edge case by forwarding the quantization index to the IDCT shader in a storage buffer, and applying the inverse quantization fully in the IDCT shader.
This commit is contained in:
averne 2025-11-08 19:57:37 +01:00 committed by Lynne
parent 28461f2c43
commit d00f41f213
3 changed files with 114 additions and 48 deletions

View file

@ -87,7 +87,7 @@ void main(void)
uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
bool act = gid.x < mb_width << (4 - chroma_shift);
/* Coalesced load of DCT coeffs in shared memory, second part of inverse quantization */
/* Coalesced load of DCT coeffs in shared memory, inverse quantization */
if (act) {
/**
* According to spec indexing an array in push constant memory with
@ -95,9 +95,14 @@ void main(void)
* so copy the whole matrix locally.
*/
uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;
/* Table 15 */
uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> 4)];
int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
[[unroll]] for (uint i = 0; i < 8; ++i) {
int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) | i))), 16);
blocks[block][i * 9 + idx] = float(v * int(qmat[(i << 3) + idx]));
int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16);
blocks[block][i * 9 + idx] = float(v * qscale * int(qmat[(i << 3) + idx]));
}
}
@ -117,7 +122,7 @@ void main(void)
if (act) {
[[unroll]] for (uint i = 0; i < 8; ++i) {
float v = blocks[block][i * 9 + idx] * fact + off;
put_px(comp, ivec2(gid.x, (gid.y << 3) | i), clamp(int(v), 0, maxv));
put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv));
}
}
}

View file

@ -22,9 +22,9 @@
void put_px(uint tex_idx, ivec2 pos, uint v)
{
#ifndef INTERLACED
imageStore(dst[tex_idx], pos, uvec4(v));
imageStore(dst[tex_idx], pos, uvec4(uint16_t(v)));
#else
imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v));
imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(uint16_t(v)));
#endif
}
@ -57,7 +57,7 @@ uint decode_codeword(inout GetBitContext gb, int codebook)
}
}
void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
{
uvec3 gid = gl_GlobalInvocationID;
uint is_luma = uint(gid.z == 0);
@ -70,7 +70,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
{
/* First coeff */
uint c = to_signed(decode_codeword(gb, 0x650));
put_px(gid.z, base_pos, c * qscale & 0xffff);
put_px(gid.z, base_pos, c);
/**
* Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
@ -89,7 +89,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
int s = int(prev_dc_diff) >> 31;
c += prev_dc_diff = (to_signed(cw) ^ s) - s;
put_px(gid.z, base_pos + pos_to_block(i, is_luma), c * qscale & 0xffff);
put_px(gid.z, base_pos + pos_to_block(i, is_luma), c);
}
}
@ -152,7 +152,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
ivec2 bpos = ivec2(scan & 0xf, scan >> 4);
uint c = ((level + 1) ^ -s) + s;
put_px(gid.z, base_pos + spos + bpos, c * qscale & 0xffff);
put_px(gid.z, base_pos + spos + bpos, c);
}
}
}
@ -218,7 +218,7 @@ void decode_alpha(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
*/
uint val = (alpha << alpha_rescale_lshift) | (alpha >> alpha_rescale_rshift);
for (uint end = pos + run; pos < end; ++pos)
put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val & 0xffff);
put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val);
}
}
@ -235,13 +235,8 @@ void main(void)
u8buf bs = u8buf(slice_data + slice_off);
/* Decode slice header */
uint hdr_size, y_size, u_size, v_size, a_size;
hdr_size = bs[0].v >> 3;
/* Table 15 */
uint qidx = clamp(bs[1].v, 1, 224),
qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
uint hdr_size, qidx, y_size, u_size, v_size, a_size;
hdr_size = bs[0].v >> 3, qidx = clamp(bs[1].v, 1, 224);
y_size = (uint(bs[2].v) << 8) | bs[3].v;
u_size = (uint(bs[4].v) << 8) | bs[5].v;
@ -308,10 +303,17 @@ void main(void)
uint mb_count = 1 << log2_width;
if (gid.z < 3) {
/* Color entropy decoding, inverse scanning, first part of inverse quantization */
decode_comp(gb, uvec2(mb_x, mb_y), mb_count, qscale);
/* Color entropy decoding, inverse scanning */
decode_comp(gb, uvec2(mb_x, mb_y), mb_count);
} else {
/* Alpha entropy decoding */
decode_alpha(gb, uvec2(mb_x, mb_y), mb_count);
}
/* Forward the quantization index to the IDCT shader */
if (gid.z == 0) {
uint base = mb_y * mb_width + mb_x;
for (uint i = 0; i < mb_count; ++i)
quant_idx[base + i] = uint8_t(qidx);
}
}

View file

@ -37,11 +37,13 @@ const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc = {
typedef struct ProresVulkanDecodePicture {
FFVulkanDecodePicture vp;
AVBufferRef *slice_offset_buf;
uint32_t slice_num;
AVBufferRef *metadata_buf;
uint32_t bitstream_start;
uint32_t bitstream_size;
uint32_t slice_num;
uint32_t slice_offsets_sz, mb_params_sz;
} ProresVulkanDecodePicture;
typedef struct ProresVulkanDecodeContext {
@ -51,7 +53,7 @@ typedef struct ProresVulkanDecodeContext {
FFVulkanShader idct;
} shaders[2]; /* Progressive/interlaced */
AVBufferPool *slice_offset_pool;
AVBufferPool *metadata_pool;
} ProresVulkanDecodeContext;
typedef struct ProresVkParameters {
@ -88,6 +90,9 @@ static int vk_prores_start_frame(AVCodecContext *avctx,
int err;
pp->slice_offsets_sz = (pr->slice_count + 1) * sizeof(uint32_t);
pp->mb_params_sz = pr->mb_width * pr->mb_height * sizeof(uint8_t);
/* Host map the input slices data if supported */
if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
RET(ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
@ -96,11 +101,10 @@ static int vk_prores_start_frame(AVCodecContext *avctx,
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT));
/* Allocate slice offsets buffer */
RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->slice_offset_pool,
&pp->slice_offset_buf,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
NULL, (pr->slice_count + 1) * sizeof(uint32_t),
RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->metadata_pool,
&pp->metadata_buf,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
NULL, pp->slice_offsets_sz + pp->mb_params_sz,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
@ -123,7 +127,7 @@ static int vk_prores_decode_slice(AVCodecContext *avctx,
ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private;
FFVulkanDecodePicture *vp = &pp->vp;
FFVkBuffer *slice_offset = (FFVkBuffer *)pp->slice_offset_buf->data;
FFVkBuffer *slice_offset = (FFVkBuffer *)pp->metadata_buf->data;
FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
/* Skip picture header */
@ -158,7 +162,7 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
FFVulkanDecodePicture *vp = &pp->vp;
ProresVkParameters pd;
FFVkBuffer *slice_data, *slice_offsets;
FFVkBuffer *slice_data, *metadata;
struct ProresVulkanShaderVariants *shaders;
VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
VkBufferMemoryBarrier2 buf_bar[2];
@ -172,8 +176,8 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
if (!pix_desc)
return AVERROR(EINVAL);
slice_data = (FFVkBuffer *)vp->slices_buf->data;
slice_offsets = (FFVkBuffer *)pp->slice_offset_buf->data;
slice_data = (FFVkBuffer *)vp->slices_buf->data;
metadata = (FFVkBuffer *)pp->metadata_buf->data;
shaders = &pv->shaders[pr->frame_type != 0];
@ -209,13 +213,13 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
pr->frame));
RET(ff_vk_exec_add_dep_buf(&ctx->s, exec,
(AVBufferRef *[]){ vp->slices_buf, pp->slice_offset_buf },
(AVBufferRef *[]){ vp->slices_buf, pp->metadata_buf, },
2, 0));
/* Transfer ownership to the exec context */
vp->slices_buf = pp->slice_offset_buf = NULL;
vp->slices_buf = pp->metadata_buf = NULL;
/* Input frame barrier */
/* Input barrier */
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
@ -223,6 +227,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
.srcStageMask = metadata->stage,
.dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
.srcAccessMask = metadata->access,
.dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = metadata->buf,
.offset = pp->slice_offsets_sz,
.size = pp->mb_params_sz,
};
metadata->stage = buf_bar[0].dstStageMask;
metadata->access = buf_bar[0].dstAccessMask;
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
.pBufferMemoryBarriers = buf_bar,
@ -267,12 +286,17 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
/* Entropy decode */
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld,
0, 0, 0,
slice_offsets,
0, (pp->slice_num + 1) * sizeof(uint32_t),
metadata, 0,
pp->slice_offsets_sz,
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld,
0, 1, 0,
metadata, pp->slice_offsets_sz,
pp->mb_params_sz,
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->vld,
pr->frame, vp->view.out,
0, 1,
0, 2,
VK_IMAGE_LAYOUT_GENERAL,
VK_NULL_HANDLE);
@ -286,7 +310,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
3 + !!pr->alpha_info);
/* Synchronize vld and idct shaders */
nb_img_bar = 0;
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
@ -294,6 +317,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
.srcStageMask = metadata->stage,
.dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
.srcAccessMask = metadata->access,
.dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = metadata->buf,
.offset = pp->slice_offsets_sz,
.size = pp->mb_params_sz,
};
metadata->stage = buf_bar[0].dstStageMask;
metadata->access = buf_bar[0].dstAccessMask;
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
.pBufferMemoryBarriers = buf_bar,
@ -304,9 +342,14 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
nb_img_bar = nb_buf_bar = 0;
/* Inverse transform */
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->idct,
0, 0, 0,
metadata, pp->slice_offsets_sz,
pp->mb_params_sz,
VK_FORMAT_UNDEFINED);
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->idct,
pr->frame, vp->view.out,
0, 0,
0, 1,
VK_IMAGE_LAYOUT_GENERAL,
VK_NULL_HANDLE);
@ -406,23 +449,23 @@ static void vk_decode_prores_uninit(FFVulkanDecodeShared *ctx)
ff_vk_shader_free(&ctx->s, &pv->shaders[i].idct);
}
av_buffer_pool_uninit(&pv->slice_offset_pool);
av_buffer_pool_uninit(&pv->metadata_pool);
av_freep(&pv);
}
static int vk_decode_prores_init(AVCodecContext *avctx)
{
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
FFVulkanDecodeShared *ctx = NULL;
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
FFVulkanDecodeShared *ctx = NULL;
AVHWFramesContext *out_frames_ctx;
ProresVulkanDecodeContext *pv;
FFVkSPIRVCompiler *spv;
FFVulkanDescriptorSetBinding *desc_set;
int max_num_slices, i, err;
int max_num_mbs, i, err;
max_num_slices = (avctx->coded_width >> 4) * (avctx->coded_height >> 4);
max_num_mbs = (avctx->coded_width >> 4) * (avctx->coded_height >> 4);
spv = ff_vk_spirv_init();
if (!spv) {
@ -471,7 +514,15 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.mem_quali = "readonly",
.buf_content = "uint32_t slice_offsets",
.buf_elems = max_num_slices + 1,
.buf_elems = max_num_mbs + 1,
},
{
.name = "quant_idx_buf",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.mem_quali = "writeonly",
.buf_content = "uint8_t quant_idx",
.buf_elems = max_num_mbs,
},
{
.name = "dst",
@ -485,10 +536,18 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
},
};
RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->vld,
"prores_dec_vld", "main", desc_set, 2,
"prores_dec_vld", "main", desc_set, 3,
ff_source_prores_vld_comp, 0x080801, i));
desc_set = (FFVulkanDescriptorSetBinding []) {
{
.name = "quant_idx_buf",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.mem_quali = "readonly",
.buf_content = "uint8_t quant_idx",
.buf_elems = max_num_mbs,
},
{
.name = "dst",
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@ -500,7 +559,7 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
},
};
RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->idct,
"prores_dec_idct", "main", desc_set, 1,
"prores_dec_idct", "main", desc_set, 2,
ff_source_prores_idct_comp, 0x200201, i));
}