diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c index c1c05fd959..375a4f2c99 100644 --- a/libavcodec/prores_raw.c +++ b/libavcodec/prores_raw.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/avassert.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem_internal.h" #include "libavutil/mem.h" @@ -131,11 +132,10 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile, uint16_t *dst = (uint16_t *)(frame->data[0] + tile->y*frame->linesize[0] + 2*tile->x); int idx; - const int w = FFMIN(s->tw, avctx->width - tile->x) / 2; - const int nb_blocks = w / 8; - const int log2_nb_blocks = 31 - ff_clz(nb_blocks); - const int block_mask = (1 << log2_nb_blocks) - 1; - const int nb_codes = 64 * nb_blocks; + const int log2_nb_blocks = tile->log2_nb_blocks; + const int nb_blocks = 1 << log2_nb_blocks; + const int block_mask = nb_blocks - 1; + const int nb_codes = 64 * nb_blocks; LOCAL_ALIGNED_32(int16_t, block, [64*16]); @@ -426,15 +426,13 @@ static int decode_frame(AVCodecContext *avctx, ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat); - s->nb_tw = (w + 15) >> 4; + int tw16 = (w + 15) >> 4; + s->nb_tw = (tw16 >> align) + av_popcount(~(-1 * (1 << align)) & tw16); s->nb_th = (h + 15) >> 4; - s->nb_tw = (s->nb_tw >> align) + av_popcount(~(-1 * (1 << align)) & s->nb_tw); s->nb_tiles = s->nb_tw * s->nb_th; av_log(avctx, AV_LOG_DEBUG, "%dx%d | nb_tiles: %d\n", s->nb_tw, s->nb_th, s->nb_tiles); - s->tw = s->version == 0 ? 128 : 256; s->th = 16; - av_log(avctx, AV_LOG_DEBUG, "tile_size: %dx%d\n", s->tw, s->th); av_fast_mallocz(&s->tiles, &s->tiles_size, s->nb_tiles * sizeof(*s->tiles)); if (!s->tiles) @@ -443,29 +441,38 @@ static int decode_frame(AVCodecContext *avctx, if (bytestream2_get_bytes_left(&gb) < s->nb_tiles * 2) return AVERROR_INVALIDDATA; - /* Read tile data offsets */ + /* First tile that extends past the right edge gets halved in width, + * next one gets quartered, and so on */ int offset = bytestream2_tell(&gb) + s->nb_tiles * 2; - for (int n = 0; n < s->nb_tiles; n++) { - TileContext *tile = &s->tiles[n]; + int n = 0; + for (int ty = 0; ty < s->nb_th; ty++) { + unsigned tx = 0; + int rem = tw16; + for (int e = align; rem > 0; e--) { + int unit = 1 << e; + while (unit <= rem) { + TileContext *tile = &s->tiles[n++]; + int size = bytestream2_get_be16(&gb); - int size = bytestream2_get_be16(&gb); - if (offset >= avpkt->size) - return AVERROR_INVALIDDATA; - if (size >= avpkt->size) - return AVERROR_INVALIDDATA; - if (offset > avpkt->size - size) - return AVERROR_INVALIDDATA; + if (offset >= avpkt->size) + return AVERROR_INVALIDDATA; + if (size >= avpkt->size) + return AVERROR_INVALIDDATA; + if (offset > avpkt->size - size) + return AVERROR_INVALIDDATA; - bytestream2_init(&tile->gb, avpkt->data + offset, size); + bytestream2_init(&tile->gb, avpkt->data + offset, size); + tile->x = tx * 16; + tile->y = ty * s->th; + tile->log2_nb_blocks = e; + offset += size; - tile->y = (n / s->nb_tw) * s->th; - tile->x = (n % s->nb_tw) * s->tw; - - if (avctx->width - tile->x < 16) - return AVERROR_PATCHWELCOME; - - offset += size; + tx += unit; + rem -= unit; + } + } } + av_assert1(n == s->nb_tiles); ret = ff_thread_get_buffer(avctx, frame, 0); if (ret < 0) diff --git a/libavcodec/prores_raw.h b/libavcodec/prores_raw.h index 3ac8068dd5..23b55661e4 100644 --- a/libavcodec/prores_raw.h +++ b/libavcodec/prores_raw.h @@ -33,6 +33,7 @@ typedef struct TileContext { GetByteContext gb; unsigned x, y; + int log2_nb_blocks; } TileContext; typedef struct ProResRAWContext { @@ -42,7 +43,7 @@ typedef struct ProResRAWContext { TileContext *tiles; unsigned int tiles_size; int nb_tiles; - int tw, th; + int th; int nb_tw, nb_th; enum AVPixelFormat pix_fmt; diff --git a/libavcodec/vulkan/prores_raw_decode.comp.glsl b/libavcodec/vulkan/prores_raw_decode.comp.glsl index c1ab920e27..92859d59d0 100644 --- a/libavcodec/vulkan/prores_raw_decode.comp.glsl +++ b/libavcodec/vulkan/prores_raw_decode.comp.glsl @@ -30,6 +30,7 @@ struct TileData { ivec2 pos; uint offset; uint size; + uint log2_nb_blocks; }; layout (set = 0, binding = 0, r16ui) uniform writeonly uimage2D dst; @@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { layout (push_constant, scalar) uniform pushConstants { u8buf pkt_data; - ivec2 tile_size; }; #define COMP_ID (gl_LocalInvocationID.y) @@ -215,10 +215,6 @@ void main(void) const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; TileData td = tile_data[tile_idx]; - int width = imageSize(dst).x; - if (expectEXT(td.pos.x >= width, false)) - return; - uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; u8vec2buf hdr_data = u8vec2buf(pkt_offset); int header_len = hdr_data[0].v.x >> 3; @@ -232,8 +228,7 @@ void main(void) return; const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); - const int w = min(tile_size.x, width - td.pos.x) >> 1; - const int nb_blocks = w >> 3; + const int nb_blocks = 1 << td.log2_nb_blocks; const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3], size[2], diff --git a/libavcodec/vulkan/prores_raw_idct.comp.glsl b/libavcodec/vulkan/prores_raw_idct.comp.glsl index 15af6d5a3f..3393ea3402 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp.glsl +++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl @@ -30,6 +30,7 @@ struct TileData { ivec2 pos; uint offset; uint size; + uint log2_nb_blocks; }; layout (set = 0, binding = 0, r16ui) uniform uimage2D dst; @@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { layout (push_constant, scalar) uniform pushConstants { u8buf pkt_data; - ivec2 tile_size; uint8_t qmat[64]; }; @@ -73,17 +73,12 @@ void main(void) const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; TileData td = tile_data[tile_idx]; - int width = imageSize(dst).x; - if (expectEXT(td.pos.x >= width, false)) - return; - uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; u8vec2buf hdr_data = u8vec2buf(pkt_offset); int qscale = pack16(hdr_data[0].v.yx); const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); - const uint w = min(tile_size.x, width - td.pos.x) >> 1; - const uint nb_blocks = w >> 3; + const uint nb_blocks = 1 << td.log2_nb_blocks; /* Copy push-constant qmat into shared memory for fast non-uniform access */ if (gl_LocalInvocationIndex < 64) @@ -110,6 +105,10 @@ void main(void) idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1); barrier(); + /* Border tile check */ + if (BLOCK_ID >= nb_blocks) + return; + [[unroll]] for (uint y = 0; y < 8; y++) { int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0)); diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 392b74a863..953b67d592 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -51,7 +51,6 @@ typedef struct ProResRAWVulkanDecodeContext { typedef struct DecodePushData { VkDeviceAddress pkt_data; - int32_t tile_size[2]; uint8_t qmat[64]; } DecodePushData; @@ -59,6 +58,7 @@ typedef struct TileData { int32_t pos[2]; uint32_t offset; uint32_t size; + uint32_t log2_nb_blocks; } TileData; static int vk_prores_raw_start_frame(AVCodecContext *avctx, @@ -118,6 +118,7 @@ static int vk_prores_raw_decode_slice(AVCodecContext *avctx, td[pp->nb_tiles].pos[0] = prr->tiles[pp->nb_tiles].x; td[pp->nb_tiles].pos[1] = prr->tiles[pp->nb_tiles].y; td[pp->nb_tiles].size = size; + td[pp->nb_tiles].log2_nb_blocks = prr->tiles[pp->nb_tiles].log2_nb_blocks; if (vp->slices_buf && slices_buf->host_ref) { td[pp->nb_tiles].offset = data - slices_buf->mapped_mem; @@ -229,8 +230,6 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx) /* Update push data */ DecodePushData pd_decode = (DecodePushData) { .pkt_data = slices_buf->address, - .tile_size[0] = prr->tw, - .tile_size[1] = prr->th, }; memcpy(pd_decode.qmat, prr->qmat, 64); ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,