meshoptimizer: Update to 0.24

This commit is contained in:
Jakub Marcowski 2025-06-27 01:02:34 +02:00
parent 9a3976097f
commit 893f5b37f4
No known key found for this signature in database
GPG key ID: 10D9E07CFFBC0E6F
14 changed files with 1327 additions and 435 deletions

View file

@ -10,6 +10,8 @@
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
// Jack Ritter. An Efficient Bounding Sphere. 1990
// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
namespace meshopt
{
@ -23,6 +25,9 @@ const size_t kMeshletMaxTriangles = 512;
const size_t kMeshletMaxSeeds = 256;
const size_t kMeshletAddSeeds = 4;
// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
const int kMeshletMaxTreeDepth = 50;
struct TriangleAdjacency2
{
unsigned int* counts;
@ -144,37 +149,62 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un
}
}
static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride)
static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
{
static const float kAxes[7][3] = {
// X, Y, Z
{1, 0, 0},
{0, 1, 0},
{0, 0, 1},
// XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
{0.57735026f, 0.57735026f, 0.57735026f},
{-0.57735026f, 0.57735026f, 0.57735026f},
{0.57735026f, -0.57735026f, 0.57735026f},
{0.57735026f, 0.57735026f, -0.57735026f},
};
assert(count > 0);
assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));
size_t points_stride_float = points_stride / sizeof(float);
size_t radii_stride_float = radii_stride / sizeof(float);
// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
size_t pmin[3] = {0, 0, 0};
size_t pmax[3] = {0, 0, 0};
// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
size_t pmin[7], pmax[7];
float tmin[7], tmax[7];
for (size_t axis = 0; axis < axis_count; ++axis)
{
pmin[axis] = pmax[axis] = 0;
tmin[axis] = FLT_MAX;
tmax[axis] = -FLT_MAX;
}
for (size_t i = 0; i < count; ++i)
{
const float* p = points + i * points_stride_float;
float r = radii[i * radii_stride_float];
for (int axis = 0; axis < 3; ++axis)
for (size_t axis = 0; axis < axis_count; ++axis)
{
float bmin = points[pmin[axis] * points_stride_float + axis] - radii[pmin[axis] * radii_stride_float];
float bmax = points[pmax[axis] * points_stride_float + axis] + radii[pmax[axis] * radii_stride_float];
const float* ax = kAxes[axis];
pmin[axis] = (p[axis] - r < bmin) ? i : pmin[axis];
pmax[axis] = (p[axis] + r > bmax) ? i : pmax[axis];
float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
float tpmin = tp - r, tpmax = tp + r;
pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
}
}
// find the pair of points with largest distance
int paxis = 0;
size_t paxis = 0;
float paxisdr = 0;
for (int axis = 0; axis < 3; ++axis)
for (size_t axis = 0; axis < axis_count; ++axis)
{
const float* p1 = points + pmin[axis] * points_stride_float;
const float* p2 = points + pmax[axis] * points_stride_float;
@ -698,6 +728,314 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
}
}
struct BVHBox
{
float min[3];
float max[3];
};
static void boxMerge(BVHBox& box, const BVHBox& other)
{
for (int k = 0; k < 3; ++k)
{
box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
}
}
inline float boxSurface(const BVHBox& box)
{
float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
return sx * sy + sx * sz + sy * sz;
}
inline unsigned int radixFloat(unsigned int v)
{
// if sign bit is 0, flip sign bit
// if sign bit is 1, flip everything
unsigned int mask = (int(v) >> 31) | 0x80000000;
return v ^ mask;
}
static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
{
memset(hist, 0, sizeof(hist));
const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
// compute 3 10-bit histograms in parallel (dropping 2 LSB)
for (size_t i = 0; i < count; ++i)
{
unsigned int id = radixFloat(bits[i]);
hist[(id >> 2) & 1023][0]++;
hist[(id >> 12) & 1023][1]++;
hist[(id >> 22) & 1023][2]++;
}
unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 1024; ++i)
{
unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
hist[i][0] = sum0;
hist[i][1] = sum1;
hist[i][2] = sum2;
sum0 += hx;
sum1 += hy;
sum2 += hz;
}
assert(sum0 == count && sum1 == count && sum2 == count);
}
static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
{
const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
for (size_t i = 0; i < count; ++i)
{
unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
destination[hist[id][pass]++] = source[i];
}
}
static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
{
(void)vertex_count;
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
const float* va = vertex_positions + vertex_stride_float * a;
const float* vb = vertex_positions + vertex_stride_float * b;
const float* vc = vertex_positions + vertex_stride_float * c;
BVHBox& box = boxes[i];
for (int k = 0; k < 3; ++k)
{
box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
}
}
}
static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
{
// count number of unique vertices
size_t used_vertices = 0;
for (size_t i = 0; i < count; ++i)
{
unsigned int index = order[i];
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
used[a] = used[b] = used[c] = 1;
}
// reset used[] for future invocations
for (size_t i = 0; i < count; ++i)
{
unsigned int index = order[i];
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
used[a] = used[b] = used[c] = -1;
}
if (used_vertices > max_vertices)
return false;
// mark meshlet boundary for future reassembly
assert(count > 0);
boundary[0] = 1;
memset(boundary + 1, 0, count - 1);
return true;
}
static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
{
for (size_t i = 0; i < count;)
{
size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
{
i += chunk;
continue;
}
// chunk is vertex bound, split it into smaller meshlets
assert(chunk > max_vertices / 3);
bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
i += max_vertices / 3;
}
}
static bool bvhDivisible(size_t count, size_t min, size_t max)
{
// count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)]
// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
// we avoid expensive integer divisions in the common case where min is <= max/2
return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
}
static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)
{
BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];
float* costs = static_cast<float*>(scratch);
// accumulate SAH cost in forward and backward directions
for (size_t i = 0; i < count; ++i)
{
boxMerge(accuml, boxes[order[i]]);
boxMerge(accumr, boxes[order[count - 1 - i]]);
costs[i] = boxSurface(accuml);
costs[i + count] = boxSurface(accumr);
}
bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
size_t end = aligned ? count - min : count - 1;
float rmaxf = 1.f / float(int(max));
// find best split that minimizes SAH
size_t bestsplit = 0;
float bestcost = FLT_MAX;
for (size_t i = min - 1; i < end; i += step)
{
size_t lsplit = i + 1, rsplit = count - (i + 1);
if (!bvhDivisible(lsplit, min, max))
continue;
if (aligned && !bvhDivisible(rsplit, min, max))
continue;
// costs[x] = inclusive surface area of boxes[0..x]
// costs[count-1-x] = inclusive surface area of boxes[x..count-1]
float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];
float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
if (cost > bestcost)
continue;
// fill cost; use floating point math to avoid expensive integer modulo
int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);
int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);
cost += fill * (float(lrest) * larea + float(rrest) * rarea);
if (cost < bestcost)
{
bestcost = cost;
bestsplit = i + 1;
}
}
*out_cost = bestcost;
return bestsplit;
}
static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
{
size_t l = 0, r = split;
for (size_t i = 0; i < count; ++i)
{
unsigned char side = sides[order[i]];
target[side ? r : l] = order[i];
l += 1;
l -= side;
r += side;
}
assert(l == split && r == count);
}
static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
{
if (depth >= kMeshletMaxTreeDepth)
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))
return;
unsigned int* axes[3] = {orderx, ordery, orderz};
// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
// if we could not pack the meshlet, we must be vertex bound
size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
// only use fill weight if we are optimizing for triangle count
float fill = count <= max_triangles ? 0.f : fill_weight;
// find best split that minimizes SAH
int bestk = -1;
size_t bestsplit = 0;
float bestcost = FLT_MAX;
for (int k = 0; k < 3; ++k)
{
float axiscost = FLT_MAX;
size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);
if (axissplit && axiscost < bestcost)
{
bestk = k;
bestcost = axiscost;
bestsplit = axissplit;
}
}
// this may happen if SAH costs along the admissible splits are NaN
if (bestk < 0)
return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
// mark sides of split for partitioning
unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
for (size_t i = 0; i < bestsplit; ++i)
sides[axes[bestk][i]] = 0;
for (size_t i = bestsplit; i < count; ++i)
sides[axes[bestk][i]] = 1;
// partition all axes into two sides, maintaining order
unsigned int* temp = static_cast<unsigned int*>(scratch);
for (int k = 0; k < 3; ++k)
{
if (k == bestk)
continue;
unsigned int* axis = axes[k];
memcpy(temp, axis, sizeof(unsigned int) * count);
bvhPartition(axis, temp, sides, bestsplit, count);
}
bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
}
} // namespace meshopt
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
@ -962,6 +1300,108 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
return meshlet_offset;
}
size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
if (index_count == 0)
return 0;
size_t face_count = index_count / 3;
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
meshopt_Allocator allocator;
// 3 floats plus 1 uint for sorting, or
// 2 floats for SAH costs, or
// 1 uint plus 1 byte for partitioning
float* scratch = allocator.allocate<float>(face_count * 4);
// compute bounding boxes and centroids for sorting
BVHBox* boxes = allocator.allocate<BVHBox>(face_count);
bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
for (int k = 0; k < 3; ++k)
{
unsigned int* order = axes + k * face_count;
const float* keys = scratch + k * face_count;
unsigned int hist[1024][3];
computeHistogram(hist, keys, face_count);
// 3-pass radix sort computes the resulting order into axes
for (size_t i = 0; i < face_count; ++i)
temp[i] = unsigned(i);
radixPass(order, temp, keys, face_count, hist, 0);
radixPass(temp, order, keys, face_count, hist, 1);
radixPass(order, temp, keys, face_count, hist, 2);
}
// index of the vertex in the meshlet, -1 if the vertex isn't used
short* used = allocator.allocate<short>(vertex_count);
memset(used, -1, vertex_count * sizeof(short));
unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
size_t meshlet_count = 0;
for (size_t i = 0; i < face_count; ++i)
{
assert(boundary[i] <= 1);
meshlet_count += boundary[i];
}
size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
// pack triangles into meshlets according to the order and boundaries marked by bvhSplit
meshopt_Meshlet meshlet = {};
size_t meshlet_offset = 0;
size_t meshlet_pending = meshlet_count;
for (size_t i = 0; i < face_count; ++i)
{
assert(boundary[i] <= 1);
bool split = i > 0 && boundary[i] == 1;
// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
split = false;
unsigned int index = axes[i];
assert(index < face_count);
unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
// appends triangle to the meshlet and writes previous meshlet to the output if full
meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
meshlet_pending -= boundary[i];
}
if (meshlet.triangle_count)
{
finishMeshlet(meshlet, meshlet_triangles);
meshlets[meshlet_offset++] = meshlet;
}
assert(meshlet_offset <= meshlet_bound);
return meshlet_offset;
}
meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
@ -1022,13 +1462,13 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
float psphere[4] = {};
computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0);
computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);
float center[3] = {psphere[0], psphere[1], psphere[2]};
// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
float nsphere[4] = {};
computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0);
computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);
float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@ -1155,7 +1595,7 @@ meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count,
const float rzero = 0.f;
float psphere[4] = {};
computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0);
computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);
bounds.center[0] = psphere[0];
bounds.center[1] = psphere[1];