mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-02-06 09:49:56 +00:00
vulkan/prores: forward quantization parameter to the IDCT shader
The qScale value derived from a slice's quantization index can be as large as 512, so multiplying a coefficient by it in the VLD shader could overflow the 16-bit store in extreme cases. This fixes that edge case by forwarding the quantization index to the IDCT shader in a storage buffer and applying the inverse quantization entirely there.
This commit is contained in:
parent
28461f2c43
commit
d00f41f213
3 changed files with 114 additions and 48 deletions
|
|
@ -87,7 +87,7 @@ void main(void)
|
|||
uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
|
||||
bool act = gid.x < mb_width << (4 - chroma_shift);
|
||||
|
||||
/* Coalesced load of DCT coeffs in shared memory, second part of inverse quantization */
|
||||
/* Coalesced load of DCT coeffs in shared memory, inverse quantization */
|
||||
if (act) {
|
||||
/**
|
||||
* According to spec indexing an array in push constant memory with
|
||||
|
|
@ -95,9 +95,14 @@ void main(void)
|
|||
* so copy the whole matrix locally.
|
||||
*/
|
||||
uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;
|
||||
|
||||
/* Table 15 */
|
||||
uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> 4)];
|
||||
int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < 8; ++i) {
|
||||
int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) | i))), 16);
|
||||
blocks[block][i * 9 + idx] = float(v * int(qmat[(i << 3) + idx]));
|
||||
int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16);
|
||||
blocks[block][i * 9 + idx] = float(v * qscale * int(qmat[(i << 3) + idx]));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -117,7 +122,7 @@ void main(void)
|
|||
if (act) {
|
||||
[[unroll]] for (uint i = 0; i < 8; ++i) {
|
||||
float v = blocks[block][i * 9 + idx] * fact + off;
|
||||
put_px(comp, ivec2(gid.x, (gid.y << 3) | i), clamp(int(v), 0, maxv));
|
||||
put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,9 +22,9 @@
|
|||
void put_px(uint tex_idx, ivec2 pos, uint v)
|
||||
{
|
||||
#ifndef INTERLACED
|
||||
imageStore(dst[tex_idx], pos, uvec4(v));
|
||||
imageStore(dst[tex_idx], pos, uvec4(uint16_t(v)));
|
||||
#else
|
||||
imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v));
|
||||
imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(uint16_t(v)));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -57,7 +57,7 @@ uint decode_codeword(inout GetBitContext gb, int codebook)
|
|||
}
|
||||
}
|
||||
|
||||
void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
|
||||
void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
|
||||
{
|
||||
uvec3 gid = gl_GlobalInvocationID;
|
||||
uint is_luma = uint(gid.z == 0);
|
||||
|
|
@ -70,7 +70,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
|
|||
{
|
||||
/* First coeff */
|
||||
uint c = to_signed(decode_codeword(gb, 0x650));
|
||||
put_px(gid.z, base_pos, c * qscale & 0xffff);
|
||||
put_px(gid.z, base_pos, c);
|
||||
|
||||
/**
|
||||
* Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
|
||||
|
|
@ -89,7 +89,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
|
|||
int s = int(prev_dc_diff) >> 31;
|
||||
c += prev_dc_diff = (to_signed(cw) ^ s) - s;
|
||||
|
||||
put_px(gid.z, base_pos + pos_to_block(i, is_luma), c * qscale & 0xffff);
|
||||
put_px(gid.z, base_pos + pos_to_block(i, is_luma), c);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -152,7 +152,7 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
|
|||
ivec2 bpos = ivec2(scan & 0xf, scan >> 4);
|
||||
|
||||
uint c = ((level + 1) ^ -s) + s;
|
||||
put_px(gid.z, base_pos + spos + bpos, c * qscale & 0xffff);
|
||||
put_px(gid.z, base_pos + spos + bpos, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -218,7 +218,7 @@ void decode_alpha(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
|
|||
*/
|
||||
uint val = (alpha << alpha_rescale_lshift) | (alpha >> alpha_rescale_rshift);
|
||||
for (uint end = pos + run; pos < end; ++pos)
|
||||
put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val & 0xffff);
|
||||
put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -235,13 +235,8 @@ void main(void)
|
|||
u8buf bs = u8buf(slice_data + slice_off);
|
||||
|
||||
/* Decode slice header */
|
||||
uint hdr_size, y_size, u_size, v_size, a_size;
|
||||
hdr_size = bs[0].v >> 3;
|
||||
|
||||
/* Table 15 */
|
||||
uint qidx = clamp(bs[1].v, 1, 224),
|
||||
qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
|
||||
|
||||
uint hdr_size, qidx, y_size, u_size, v_size, a_size;
|
||||
hdr_size = bs[0].v >> 3, qidx = clamp(bs[1].v, 1, 224);
|
||||
y_size = (uint(bs[2].v) << 8) | bs[3].v;
|
||||
u_size = (uint(bs[4].v) << 8) | bs[5].v;
|
||||
|
||||
|
|
@ -308,10 +303,17 @@ void main(void)
|
|||
uint mb_count = 1 << log2_width;
|
||||
|
||||
if (gid.z < 3) {
|
||||
/* Color entropy decoding, inverse scanning, first part of inverse quantization */
|
||||
decode_comp(gb, uvec2(mb_x, mb_y), mb_count, qscale);
|
||||
/* Color entropy decoding, inverse scanning */
|
||||
decode_comp(gb, uvec2(mb_x, mb_y), mb_count);
|
||||
} else {
|
||||
/* Alpha entropy decoding */
|
||||
decode_alpha(gb, uvec2(mb_x, mb_y), mb_count);
|
||||
}
|
||||
|
||||
/* Forward the quantization index to the IDCT shader */
|
||||
if (gid.z == 0) {
|
||||
uint base = mb_y * mb_width + mb_x;
|
||||
for (uint i = 0; i < mb_count; ++i)
|
||||
quant_idx[base + i] = uint8_t(qidx);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,11 +37,13 @@ const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc = {
|
|||
typedef struct ProresVulkanDecodePicture {
|
||||
FFVulkanDecodePicture vp;
|
||||
|
||||
AVBufferRef *slice_offset_buf;
|
||||
uint32_t slice_num;
|
||||
AVBufferRef *metadata_buf;
|
||||
|
||||
uint32_t bitstream_start;
|
||||
uint32_t bitstream_size;
|
||||
uint32_t slice_num;
|
||||
|
||||
uint32_t slice_offsets_sz, mb_params_sz;
|
||||
} ProresVulkanDecodePicture;
|
||||
|
||||
typedef struct ProresVulkanDecodeContext {
|
||||
|
|
@ -51,7 +53,7 @@ typedef struct ProresVulkanDecodeContext {
|
|||
FFVulkanShader idct;
|
||||
} shaders[2]; /* Progressive/interlaced */
|
||||
|
||||
AVBufferPool *slice_offset_pool;
|
||||
AVBufferPool *metadata_pool;
|
||||
} ProresVulkanDecodeContext;
|
||||
|
||||
typedef struct ProresVkParameters {
|
||||
|
|
@ -88,6 +90,9 @@ static int vk_prores_start_frame(AVCodecContext *avctx,
|
|||
|
||||
int err;
|
||||
|
||||
pp->slice_offsets_sz = (pr->slice_count + 1) * sizeof(uint32_t);
|
||||
pp->mb_params_sz = pr->mb_width * pr->mb_height * sizeof(uint8_t);
|
||||
|
||||
/* Host map the input slices data if supported */
|
||||
if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
|
||||
RET(ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
|
||||
|
|
@ -96,11 +101,10 @@ static int vk_prores_start_frame(AVCodecContext *avctx,
|
|||
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT));
|
||||
|
||||
/* Allocate slice offsets buffer */
|
||||
RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->slice_offset_pool,
|
||||
&pp->slice_offset_buf,
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
|
||||
NULL, (pr->slice_count + 1) * sizeof(uint32_t),
|
||||
RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->metadata_pool,
|
||||
&pp->metadata_buf,
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
||||
NULL, pp->slice_offsets_sz + pp->mb_params_sz,
|
||||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
|
||||
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
|
||||
|
||||
|
|
@ -123,7 +127,7 @@ static int vk_prores_decode_slice(AVCodecContext *avctx,
|
|||
ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private;
|
||||
FFVulkanDecodePicture *vp = &pp->vp;
|
||||
|
||||
FFVkBuffer *slice_offset = (FFVkBuffer *)pp->slice_offset_buf->data;
|
||||
FFVkBuffer *slice_offset = (FFVkBuffer *)pp->metadata_buf->data;
|
||||
FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
|
||||
|
||||
/* Skip picture header */
|
||||
|
|
@ -158,7 +162,7 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
FFVulkanDecodePicture *vp = &pp->vp;
|
||||
|
||||
ProresVkParameters pd;
|
||||
FFVkBuffer *slice_data, *slice_offsets;
|
||||
FFVkBuffer *slice_data, *metadata;
|
||||
struct ProresVulkanShaderVariants *shaders;
|
||||
VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
|
||||
VkBufferMemoryBarrier2 buf_bar[2];
|
||||
|
|
@ -172,8 +176,8 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
if (!pix_desc)
|
||||
return AVERROR(EINVAL);
|
||||
|
||||
slice_data = (FFVkBuffer *)vp->slices_buf->data;
|
||||
slice_offsets = (FFVkBuffer *)pp->slice_offset_buf->data;
|
||||
slice_data = (FFVkBuffer *)vp->slices_buf->data;
|
||||
metadata = (FFVkBuffer *)pp->metadata_buf->data;
|
||||
|
||||
shaders = &pv->shaders[pr->frame_type != 0];
|
||||
|
||||
|
|
@ -209,13 +213,13 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
pr->frame));
|
||||
|
||||
RET(ff_vk_exec_add_dep_buf(&ctx->s, exec,
|
||||
(AVBufferRef *[]){ vp->slices_buf, pp->slice_offset_buf },
|
||||
(AVBufferRef *[]){ vp->slices_buf, pp->metadata_buf, },
|
||||
2, 0));
|
||||
|
||||
/* Transfer ownership to the exec context */
|
||||
vp->slices_buf = pp->slice_offset_buf = NULL;
|
||||
vp->slices_buf = pp->metadata_buf = NULL;
|
||||
|
||||
/* Input frame barrier */
|
||||
/* Input barrier */
|
||||
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
|
||||
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
|
|
@ -223,6 +227,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_QUEUE_FAMILY_IGNORED);
|
||||
|
||||
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
|
||||
.srcStageMask = metadata->stage,
|
||||
.dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
.srcAccessMask = metadata->access,
|
||||
.dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.buffer = metadata->buf,
|
||||
.offset = pp->slice_offsets_sz,
|
||||
.size = pp->mb_params_sz,
|
||||
};
|
||||
metadata->stage = buf_bar[0].dstStageMask;
|
||||
metadata->access = buf_bar[0].dstAccessMask;
|
||||
|
||||
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
|
||||
.pBufferMemoryBarriers = buf_bar,
|
||||
|
|
@ -267,12 +286,17 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
/* Entropy decode */
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld,
|
||||
0, 0, 0,
|
||||
slice_offsets,
|
||||
0, (pp->slice_num + 1) * sizeof(uint32_t),
|
||||
metadata, 0,
|
||||
pp->slice_offsets_sz,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld,
|
||||
0, 1, 0,
|
||||
metadata, pp->slice_offsets_sz,
|
||||
pp->mb_params_sz,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->vld,
|
||||
pr->frame, vp->view.out,
|
||||
0, 1,
|
||||
0, 2,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
|
|
@ -286,7 +310,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
3 + !!pr->alpha_info);
|
||||
|
||||
/* Synchronize vld and idct shaders */
|
||||
nb_img_bar = 0;
|
||||
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
|
|
@ -294,6 +317,21 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_QUEUE_FAMILY_IGNORED);
|
||||
|
||||
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
|
||||
.srcStageMask = metadata->stage,
|
||||
.dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
.srcAccessMask = metadata->access,
|
||||
.dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.buffer = metadata->buf,
|
||||
.offset = pp->slice_offsets_sz,
|
||||
.size = pp->mb_params_sz,
|
||||
};
|
||||
metadata->stage = buf_bar[0].dstStageMask;
|
||||
metadata->access = buf_bar[0].dstAccessMask;
|
||||
|
||||
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
|
||||
.pBufferMemoryBarriers = buf_bar,
|
||||
|
|
@ -304,9 +342,14 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
|
|||
nb_img_bar = nb_buf_bar = 0;
|
||||
|
||||
/* Inverse transform */
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->idct,
|
||||
0, 0, 0,
|
||||
metadata, pp->slice_offsets_sz,
|
||||
pp->mb_params_sz,
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->idct,
|
||||
pr->frame, vp->view.out,
|
||||
0, 0,
|
||||
0, 1,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
|
|
@ -406,23 +449,23 @@ static void vk_decode_prores_uninit(FFVulkanDecodeShared *ctx)
|
|||
ff_vk_shader_free(&ctx->s, &pv->shaders[i].idct);
|
||||
}
|
||||
|
||||
av_buffer_pool_uninit(&pv->slice_offset_pool);
|
||||
av_buffer_pool_uninit(&pv->metadata_pool);
|
||||
|
||||
av_freep(&pv);
|
||||
}
|
||||
|
||||
static int vk_decode_prores_init(AVCodecContext *avctx)
|
||||
{
|
||||
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
|
||||
FFVulkanDecodeShared *ctx = NULL;
|
||||
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
|
||||
FFVulkanDecodeShared *ctx = NULL;
|
||||
|
||||
AVHWFramesContext *out_frames_ctx;
|
||||
ProresVulkanDecodeContext *pv;
|
||||
FFVkSPIRVCompiler *spv;
|
||||
FFVulkanDescriptorSetBinding *desc_set;
|
||||
int max_num_slices, i, err;
|
||||
int max_num_mbs, i, err;
|
||||
|
||||
max_num_slices = (avctx->coded_width >> 4) * (avctx->coded_height >> 4);
|
||||
max_num_mbs = (avctx->coded_width >> 4) * (avctx->coded_height >> 4);
|
||||
|
||||
spv = ff_vk_spirv_init();
|
||||
if (!spv) {
|
||||
|
|
@ -471,7 +514,15 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
|
|||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.mem_quali = "readonly",
|
||||
.buf_content = "uint32_t slice_offsets",
|
||||
.buf_elems = max_num_slices + 1,
|
||||
.buf_elems = max_num_mbs + 1,
|
||||
},
|
||||
{
|
||||
.name = "quant_idx_buf",
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.mem_quali = "writeonly",
|
||||
.buf_content = "uint8_t quant_idx",
|
||||
.buf_elems = max_num_mbs,
|
||||
},
|
||||
{
|
||||
.name = "dst",
|
||||
|
|
@ -485,10 +536,18 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
|
|||
},
|
||||
};
|
||||
RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->vld,
|
||||
"prores_dec_vld", "main", desc_set, 2,
|
||||
"prores_dec_vld", "main", desc_set, 3,
|
||||
ff_source_prores_vld_comp, 0x080801, i));
|
||||
|
||||
desc_set = (FFVulkanDescriptorSetBinding []) {
|
||||
{
|
||||
.name = "quant_idx_buf",
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.mem_quali = "readonly",
|
||||
.buf_content = "uint8_t quant_idx",
|
||||
.buf_elems = max_num_mbs,
|
||||
},
|
||||
{
|
||||
.name = "dst",
|
||||
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
|
||||
|
|
@ -500,7 +559,7 @@ static int vk_decode_prores_init(AVCodecContext *avctx)
|
|||
},
|
||||
};
|
||||
RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->idct,
|
||||
"prores_dec_idct", "main", desc_set, 1,
|
||||
"prores_dec_idct", "main", desc_set, 2,
|
||||
ff_source_prores_idct_comp, 0x200201, i));
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue