prores_raw: fix tile alignment issues

Reverse-engineered the decoder a bit more. All tiles are always 16x1. The issue is that tiles at the right edge do not all have the same width: the first tile that would clip is half the regular width, and each tile after it is half the width of the previous one.
2026-06-04 22:50:24 +00:00 · 2026-05-15 02:46:11 +09:00 · 2026-05-15 02:46:11 +09:00 · d8cb567171
commit d8cb567171
parent eb24fb0c7f
5 changed files with 46 additions and 45 deletions
--- a/libavcodec/prores_raw.c
+++ b/libavcodec/prores_raw.c
@ -20,6 +20,7 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/mem.h"
@ -131,11 +132,10 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile,
    uint16_t *dst = (uint16_t *)(frame->data[0] + tile->y*frame->linesize[0] + 2*tile->x);

    int idx;
-    const int w = FFMIN(s->tw, avctx->width - tile->x) / 2;
-    const int nb_blocks = w / 8;
-    const int log2_nb_blocks = 31 - ff_clz(nb_blocks);
-    const int block_mask = (1 << log2_nb_blocks) - 1;
-    const int nb_codes = 64 * nb_blocks;
+    const int log2_nb_blocks = tile->log2_nb_blocks;
+    const int nb_blocks  = 1 << log2_nb_blocks;
+    const int block_mask = nb_blocks - 1;
+    const int nb_codes   = 64 * nb_blocks;

    LOCAL_ALIGNED_32(int16_t, block, [64*16]);

@ -426,15 +426,13 @@ static int decode_frame(AVCodecContext *avctx,

    ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat);

-    s->nb_tw = (w + 15) >> 4;
+    int tw16 = (w + 15) >> 4;
+    s->nb_tw = (tw16 >> align) + av_popcount(~(-1 * (1 << align)) & tw16);
    s->nb_th = (h + 15) >> 4;
-    s->nb_tw = (s->nb_tw >> align) + av_popcount(~(-1 * (1 << align)) & s->nb_tw);
    s->nb_tiles = s->nb_tw * s->nb_th;
    av_log(avctx, AV_LOG_DEBUG, "%dx%d | nb_tiles: %d\n", s->nb_tw, s->nb_th, s->nb_tiles);

-    s->tw = s->version == 0 ? 128 : 256;
    s->th = 16;
-    av_log(avctx, AV_LOG_DEBUG, "tile_size: %dx%d\n", s->tw, s->th);

    av_fast_mallocz(&s->tiles, &s->tiles_size, s->nb_tiles * sizeof(*s->tiles));
    if (!s->tiles)
@ -443,29 +441,38 @@ static int decode_frame(AVCodecContext *avctx,
    if (bytestream2_get_bytes_left(&gb) < s->nb_tiles * 2)
        return AVERROR_INVALIDDATA;

-    /* Read tile data offsets */
+    /* First tile that extends past the right edge gets halved in width,
+     * next one gets quartered, and so on */
    int offset = bytestream2_tell(&gb) + s->nb_tiles * 2;
-    for (int n = 0; n < s->nb_tiles; n++) {
-        TileContext *tile = &s->tiles[n];
+    int n = 0;
+    for (int ty = 0; ty < s->nb_th; ty++) {
+        unsigned tx = 0;
+        int rem = tw16;
+        for (int e = align; rem > 0; e--) {
+            int unit = 1 << e;
+            while (unit <= rem) {
+                TileContext *tile = &s->tiles[n++];
+                int size = bytestream2_get_be16(&gb);

-        int size = bytestream2_get_be16(&gb);
-        if (offset >= avpkt->size)
-            return AVERROR_INVALIDDATA;
-        if (size >= avpkt->size)
-            return AVERROR_INVALIDDATA;
-        if (offset > avpkt->size - size)
-            return AVERROR_INVALIDDATA;
+                if (offset >= avpkt->size)
+                    return AVERROR_INVALIDDATA;
+                if (size >= avpkt->size)
+                    return AVERROR_INVALIDDATA;
+                if (offset > avpkt->size - size)
+                    return AVERROR_INVALIDDATA;

-        bytestream2_init(&tile->gb, avpkt->data + offset, size);
+                bytestream2_init(&tile->gb, avpkt->data + offset, size);
+                tile->x = tx * 16;
+                tile->y = ty * s->th;
+                tile->log2_nb_blocks = e;
+                offset += size;

-        tile->y = (n / s->nb_tw) * s->th;
-        tile->x = (n % s->nb_tw) * s->tw;
-
-        if (avctx->width - tile->x < 16)
-            return AVERROR_PATCHWELCOME;
-
-        offset += size;
+                tx  += unit;
+                rem -= unit;
+            }
+        }
    }
+    av_assert1(n == s->nb_tiles);

    ret = ff_thread_get_buffer(avctx, frame, 0);
    if (ret < 0)
--- a/libavcodec/prores_raw.h
+++ b/libavcodec/prores_raw.h
@ -33,6 +33,7 @@
 typedef struct TileContext {
    GetByteContext gb;
    unsigned x, y;
+    int log2_nb_blocks; /* log2 of the tile's block count; the tile spans (1 << log2_nb_blocks) 16-pixel columns */
 } TileContext;

 typedef struct ProResRAWContext {
@ -42,7 +43,7 @@ typedef struct ProResRAWContext {
    TileContext *tiles;
    unsigned int tiles_size;
    int nb_tiles;
-    int tw, th;
+    int th;
    int nb_tw, nb_th;

    enum AVPixelFormat pix_fmt;
--- a/libavcodec/vulkan/prores_raw_decode.comp.glsl
+++ b/libavcodec/vulkan/prores_raw_decode.comp.glsl
@ -30,6 +30,7 @@ struct TileData {
   ivec2 pos;
   uint offset;
   uint size;
+   uint log2_nb_blocks;
 };

 layout (set = 0, binding = 0, r16ui) uniform writeonly uimage2D dst;
@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {

 layout (push_constant, scalar) uniform pushConstants {
   u8buf pkt_data;
-   ivec2 tile_size;
 };

 #define COMP_ID (gl_LocalInvocationID.y)
@ -215,10 +215,6 @@ void main(void)
    const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
    TileData td = tile_data[tile_idx];

-    int width = imageSize(dst).x;
-    if (expectEXT(td.pos.x >= width, false))
-        return;
-
    uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
    u8vec2buf hdr_data = u8vec2buf(pkt_offset);
    int header_len = hdr_data[0].v.x >> 3;
@ -232,8 +228,7 @@ void main(void)
        return;

    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
-    const int w = min(tile_size.x, width - td.pos.x) >> 1;
-    const int nb_blocks = w >> 3;
+    const int nb_blocks = 1 << td.log2_nb_blocks;

    const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3],
                                    size[2],
--- a/libavcodec/vulkan/prores_raw_idct.comp.glsl
+++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl
@ -30,6 +30,7 @@ struct TileData {
   ivec2 pos;
   uint offset;
   uint size;
+   uint log2_nb_blocks;
 };

 layout (set = 0, binding = 0, r16ui) uniform uimage2D dst;
@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {

 layout (push_constant, scalar) uniform pushConstants {
   u8buf pkt_data;
-   ivec2 tile_size;
   uint8_t qmat[64];
 };

@ -73,17 +73,12 @@ void main(void)
    const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
    TileData td = tile_data[tile_idx];

-    int width = imageSize(dst).x;
-    if (expectEXT(td.pos.x >= width, false))
-        return;
-
    uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
    u8vec2buf hdr_data = u8vec2buf(pkt_offset);
    int qscale = pack16(hdr_data[0].v.yx);

    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
-    const uint w = min(tile_size.x, width - td.pos.x) >> 1;
-    const uint nb_blocks = w >> 3;
+    const uint nb_blocks = 1 << td.log2_nb_blocks;

    /* Copy push-constant qmat into shared memory for fast non-uniform access */
    if (gl_LocalInvocationIndex < 64)
@ -110,6 +105,10 @@ void main(void)
    idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1);
    barrier();

+    /* Border tile check */
+    if (BLOCK_ID >= nb_blocks)
+        return;
+
    [[unroll]]
    for (uint y = 0; y < 8; y++) {
        int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0));
--- a/libavcodec/vulkan_prores_raw.c
+++ b/libavcodec/vulkan_prores_raw.c
@ -51,7 +51,6 @@ typedef struct ProResRAWVulkanDecodeContext {

 typedef struct DecodePushData {
    VkDeviceAddress pkt_data;
-    int32_t tile_size[2];
    uint8_t  qmat[64];
 } DecodePushData;

@ -59,6 +58,7 @@ typedef struct TileData {
    int32_t pos[2];
    uint32_t offset;
    uint32_t size;
+    uint32_t log2_nb_blocks;
 } TileData;

 static int vk_prores_raw_start_frame(AVCodecContext          *avctx,
@ -118,6 +118,7 @@ static int vk_prores_raw_decode_slice(AVCodecContext *avctx,
    td[pp->nb_tiles].pos[0] = prr->tiles[pp->nb_tiles].x;
    td[pp->nb_tiles].pos[1] = prr->tiles[pp->nb_tiles].y;
    td[pp->nb_tiles].size = size;
+    td[pp->nb_tiles].log2_nb_blocks = prr->tiles[pp->nb_tiles].log2_nb_blocks;

    if (vp->slices_buf && slices_buf->host_ref) {
        td[pp->nb_tiles].offset = data - slices_buf->mapped_mem;
@ -229,8 +230,6 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx)
    /* Update push data */
    DecodePushData pd_decode = (DecodePushData) {
        .pkt_data = slices_buf->address,
-        .tile_size[0] = prr->tw,
-        .tile_size[1] = prr->th,
    };
    memcpy(pd_decode.qmat, prr->qmat, 64);
    ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,