ffmpeg/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl
Lynne d66552e676
vulkan/ffv1: add 32-bit float RGB encoding and a rice + remap path
This implements 32-bit float RGB encoding and makes the Vulkan implementation
on-par with the C implementation.

Sponsored-by: Sovereign Tech Fund
2026-05-30 12:10:01 +09:00

153 lines
5.6 KiB
GLSL

/*
* FFv1 codec
*
* Copyright (c) 2026 Lynne <dev@lynne.ee>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#define SB_QUALI readonly
#include "common.glsl"
#include "ffv1_common.glsl"
/* Source image planes; main() reads plane p with imageLoad(src[p], ...). */
layout (set = 1, binding = 1) uniform uimage2D src[];
/* Flat uint storage that receives the sorted (val, ndx) pairs; for large
 * slices it is also the working memory of the sort itself. */
layout (set = 1, binding = 2, scalar) buffer fltmap_buf {
uint fltmap[];
};
/* The shared fltmap_buf is laid out per (slice, plane) as a
* max_pixels_per_slice*3 uint block, where the first
* max_pixels_per_slice*2 entries hold interleaved (val, ndx) pairs and
* the trailing [max_pixels_per_slice] entries are the bitmap region used
* by the setup/encode shaders. Padding past pixel_num is the sentinel
* (UINT32_MAX, UINT32_MAX) so it sorts at the end. */
/* Per-workgroup scratch for the bitonic sort, one (val, ndx) pair per entry.
 * It is used only when the slice's padded power-of-two size is at most 8192
 * entries; larger slices are sorted in place in fltmap (global memory). */
shared u32vec2 smem[8192];
/* Sorts the pixels of one slice by value, one colour plane at a time.
 *
 * The slice handled by a workgroup is derived from gl_WorkGroupID; its bounds
 * come from slice_coord(). For every plane the invocations:
 *   1. load (val, ndx) pairs, padding up to N entries with the
 *      (0xFFFFFFFF, 0xFFFFFFFF) sentinel,
 *   2. run a bitonic sort over the N pairs, ordered by val and then by ndx,
 *   3. leave the sorted pairs interleaved in fltmap at the plane's base offset.
 *
 * Slices with N <= 8192 are sorted in shared memory and copied out afterwards;
 * larger ones are sorted directly in fltmap. */
void main(void)
{
    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;

    uvec2 img_size = imageSize(src[0]);
    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
                           gl_NumWorkGroups.x, 0);
    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
                           gl_NumWorkGroups.x, 0);
    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
                           gl_NumWorkGroups.y, 0);
    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
                           gl_NumWorkGroups.y, 0);

    uint slice_w = sxe - sxs;
    uint slice_h = sye - sys;
    uint pixel_num = slice_w * slice_h;

    /* Round up to next pow2 for bitonic sort, never below 2. */
    uint N = 1;
    while (N < pixel_num)
        N <<= 1;
    N = max(N, 2u);
    /* NOTE(review): the clamp keeps N a valid bitonic size only if
     * max_pixels_per_slice is itself a power of two and >= pixel_num;
     * confirm against the host-side allocation. */
    if (N > max_pixels_per_slice)
        N = max_pixels_per_slice;

    const uint plane_stride = max_pixels_per_slice*3u;
    const bool use_smem = N <= 8192u;

    /* Distance between consecutive elements handled by one invocation. */
    const uint wg_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y;

    for (int p = 0; p < color_planes; p++) {
        /* Each slice owns four plane-sized regions in fltmap. */
        uint base = (slice_idx*4u + uint(p))*plane_stride;

        /* Load pixels */
        for (uint i = gl_LocalInvocationIndex; i < N; i += wg_stride) {
            uint v, ndx;
            if (i < pixel_num) {
                /* Linear index -> (x, y) inside the slice, row-major. */
                uint y = i / slice_w;
                uint x = i - y*slice_w;
                v = imageLoad(src[p], ivec2(sxs + x, sys + y))[0];
                /* remap_mode 2: values with the sign bit set are kept,
                 * all others get their low 31 bits inverted. */
                if (remap_mode == 2)
                    v = ((v & 0x80000000u) != 0u) ? v : (v ^ 0x7FFFFFFFu);
                ndx = i;
            } else {
                /* Padding sentinel, compares greater than any real pair. */
                v   = 0xFFFFFFFFu;
                ndx = 0xFFFFFFFFu;
            }

            if (use_smem) {
                smem[i] = u32vec2(v, ndx);
            } else {
                fltmap[base + 2u*i + 0u] = v;
                fltmap[base + 2u*i + 1u] = ndx;
            }
        }

        /* Global-memory path: fence the buffer writes BEFORE the execution
         * barrier so they are ordered ahead of it, and fence again after it
         * before reading what the other invocations wrote. The original only
         * fenced after the barrier.
         * NOTE(review): fltmap_buf is not declared coherent; confirm that
         * cross-invocation visibility is guaranteed on the targeted drivers. */
        if (!use_smem)
            memoryBarrierBuffer();
        barrier();
        if (!use_smem)
            memoryBarrierBuffer();

        /* Bitonic sort of the (val, ndx) pairs. */
        for (uint k = 2; k <= N; k <<= 1) {
            for (uint j = k >> 1; j > 0; j >>= 1) {
                for (uint i = gl_LocalInvocationIndex; i < N; i += wg_stride) {
                    uint partner = i ^ j;

                    /* Only the lower index of each pair does the exchange. */
                    if (partner > i) {
                        bool ascending = (i & k) == 0;

                        u32vec2 a, b;
                        if (use_smem) {
                            a = smem[i];
                            b = smem[partner];
                        } else {
                            a = u32vec2(fltmap[base + 2u*i + 0u],
                                        fltmap[base + 2u*i + 1u]);
                            b = u32vec2(fltmap[base + 2u*partner + 0u],
                                        fltmap[base + 2u*partner + 1u]);
                        }

                        /* Order by value first, then by original index. */
                        bool a_gt_b = (a.x > b.x) ||
                                      (a.x == b.x && a.y > b.y);

                        /* Swap when the pair is out of order for this
                         * sub-sequence's direction. */
                        if (a_gt_b == ascending) {
                            if (use_smem) {
                                smem[i]       = b;
                                smem[partner] = a;
                            } else {
                                fltmap[base + 2u*i + 0u]       = b.x;
                                fltmap[base + 2u*i + 1u]       = b.y;
                                fltmap[base + 2u*partner + 0u] = a.x;
                                fltmap[base + 2u*partner + 1u] = a.y;
                            }
                        }
                    }
                }

                /* Separate this compare-exchange stage from the next one;
                 * same fence/barrier/fence sequence as after the load. */
                if (!use_smem)
                    memoryBarrierBuffer();
                barrier();
                if (!use_smem)
                    memoryBarrierBuffer();
            }
        }

        /* Write sorted pairs back to global */
        if (use_smem) {
            for (uint i = gl_LocalInvocationIndex; i < N; i += wg_stride) {
                u32vec2 u = smem[i];
                fltmap[base + 2u*i + 0u] = u.x;
                fltmap[base + 2u*i + 1u] = u.y;
            }
            /* Everyone is done with smem before the next plane reuses it. */
            barrier();
        }
    }
}