meshoptimizer: Update to 0.24

2025-10-26 03:04:31 +00:00 · 2025-06-27 01:02:34 +02:00 · 2025-06-27 01:02:34 +02:00 · 893f5b37f4
commit 893f5b37f4
parent 9a3976097f
14 changed files with 1327 additions and 435 deletions
--- a/thirdparty/meshoptimizer/clusterizer.cpp
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@ -10,6 +10,8 @@
 // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
 // Jack Ritter. An Efficient Bounding Sphere. 1990
+// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
+// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
 namespace meshopt
 {

@ -23,6 +25,9 @@ const size_t kMeshletMaxTriangles = 512;
 const size_t kMeshletMaxSeeds = 256;
 const size_t kMeshletAddSeeds = 4;

+// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
+const int kMeshletMaxTreeDepth = 50;
+
 struct TriangleAdjacency2
 {
 	unsigned int* counts;
@ -144,37 +149,62 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un
 	}
 }

-static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride)
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
 {
+	static const float kAxes[7][3] = {
+	    // X, Y, Z
+	    {1, 0, 0},
+	    {0, 1, 0},
+	    {0, 0, 1},
+
+	    // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
+	    {0.57735026f, 0.57735026f, 0.57735026f},
+	    {-0.57735026f, 0.57735026f, 0.57735026f},
+	    {0.57735026f, -0.57735026f, 0.57735026f},
+	    {0.57735026f, 0.57735026f, -0.57735026f},
+	};
+
 	assert(count > 0);
+	assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));

 	size_t points_stride_float = points_stride / sizeof(float);
 	size_t radii_stride_float = radii_stride / sizeof(float);

-	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
-	size_t pmin[3] = {0, 0, 0};
-	size_t pmax[3] = {0, 0, 0};
+	// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[7], pmax[7];
+	float tmin[7], tmax[7];
+
+	for (size_t axis = 0; axis < axis_count; ++axis)
+	{
+		pmin[axis] = pmax[axis] = 0;
+		tmin[axis] = FLT_MAX;
+		tmax[axis] = -FLT_MAX;
+	}

 	for (size_t i = 0; i < count; ++i)
 	{
 		const float* p = points + i * points_stride_float;
 		float r = radii[i * radii_stride_float];

-		for (int axis = 0; axis < 3; ++axis)
+		for (size_t axis = 0; axis < axis_count; ++axis)
 		{
-			float bmin = points[pmin[axis] * points_stride_float + axis] - radii[pmin[axis] * radii_stride_float];
-			float bmax = points[pmax[axis] * points_stride_float + axis] + radii[pmax[axis] * radii_stride_float];
+			const float* ax = kAxes[axis];

-			pmin[axis] = (p[axis] - r < bmin) ? i : pmin[axis];
-			pmax[axis] = (p[axis] + r > bmax) ? i : pmax[axis];
+			float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
+			float tpmin = tp - r, tpmax = tp + r;
+
+			pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
+			pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
+			tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
+			tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
 		}
 	}

 	// find the pair of points with largest distance
-	int paxis = 0;
+	size_t paxis = 0;
 	float paxisdr = 0;

-	for (int axis = 0; axis < 3; ++axis)
+	for (size_t axis = 0; axis < axis_count; ++axis)
 	{
 		const float* p1 = points + pmin[axis] * points_stride_float;
 		const float* p2 = points + pmax[axis] * points_stride_float;
@ -698,6 +728,314 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 	}
 }

+struct BVHBox
+{
+	float min[3];
+	float max[3];
+};
+
+static void boxMerge(BVHBox& box, const BVHBox& other)
+{
+	for (int k = 0; k < 3; ++k)
+	{
+		box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
+		box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
+	}
+}
+
+inline float boxSurface(const BVHBox& box)
+{
+	float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
+	return sx * sy + sx * sz + sy * sz;
+}
+
+inline unsigned int radixFloat(unsigned int v)
+{
+	// if sign bit is 0, flip sign bit
+	// if sign bit is 1, flip everything
+	unsigned int mask = (int(v) >> 31) | 0x80000000;
+	return v ^ mask;
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
+{
+	memset(hist, 0, sizeof(hist));
+
+	const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
+
+	// compute 3 10-bit histograms in parallel (dropping 2 LSB)
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = radixFloat(bits[i]);
+
+		hist[(id >> 2) & 1023][0]++;
+		hist[(id >> 12) & 1023][1]++;
+		hist[(id >> 22) & 1023][2]++;
+	}
+
+	unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+		hist[i][0] = sum0;
+		hist[i][1] = sum1;
+		hist[i][2] = sum2;
+
+		sum0 += hx;
+		sum1 += hy;
+		sum2 += hz;
+	}
+
+	assert(sum0 == count && sum1 == count && sum2 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+	const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
+	int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
+
+		destination[hist[id][pass]++] = source[i];
+	}
+}
+
+static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
+{
+	(void)vertex_count;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* va = vertex_positions + vertex_stride_float * a;
+		const float* vb = vertex_positions + vertex_stride_float * b;
+		const float* vc = vertex_positions + vertex_stride_float * c;
+
+		BVHBox& box = boxes[i];
+
+		for (int k = 0; k < 3; ++k)
+		{
+			box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
+			box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
+
+			box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
+			box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
+
+			centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
+		}
+	}
+}
+
+static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
+{
+	// count number of unique vertices
+	size_t used_vertices = 0;
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int index = order[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+		used[a] = used[b] = used[c] = 1;
+	}
+
+	// reset used[] for future invocations
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int index = order[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		used[a] = used[b] = used[c] = -1;
+	}
+
+	if (used_vertices > max_vertices)
+		return false;
+
+	// mark meshlet boundary for future reassembly
+	assert(count > 0);
+
+	boundary[0] = 1;
+	memset(boundary + 1, 0, count - 1);
+
+	return true;
+}
+
+static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
+{
+	for (size_t i = 0; i < count;)
+	{
+		size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
+
+		if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
+		{
+			i += chunk;
+			continue;
+		}
+
+		// chunk is vertex bound, split it into smaller meshlets
+		assert(chunk > max_vertices / 3);
+
+		bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
+		i += max_vertices / 3;
+	}
+}
+
+static bool bvhDivisible(size_t count, size_t min, size_t max)
+{
+	// count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)]
+	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
+	// we avoid expensive integer divisions in the common case where min is <= max/2
+	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
+}
+
+static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)
+{
+	BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];
+	float* costs = static_cast<float*>(scratch);
+
+	// accumulate SAH cost in forward and backward directions
+	for (size_t i = 0; i < count; ++i)
+	{
+		boxMerge(accuml, boxes[order[i]]);
+		boxMerge(accumr, boxes[order[count - 1 - i]]);
+
+		costs[i] = boxSurface(accuml);
+		costs[i + count] = boxSurface(accumr);
+	}
+
+	bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
+	size_t end = aligned ? count - min : count - 1;
+
+	float rmaxf = 1.f / float(int(max));
+
+	// find best split that minimizes SAH
+	size_t bestsplit = 0;
+	float bestcost = FLT_MAX;
+
+	for (size_t i = min - 1; i < end; i += step)
+	{
+		size_t lsplit = i + 1, rsplit = count - (i + 1);
+
+		if (!bvhDivisible(lsplit, min, max))
+			continue;
+		if (aligned && !bvhDivisible(rsplit, min, max))
+			continue;
+
+		// costs[x] = inclusive surface area of boxes[0..x]
+		// costs[count-1-x] = inclusive surface area of boxes[x..count-1]
+		float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];
+		float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
+
+		if (cost > bestcost)
+			continue;
+
+		// fill cost; use floating point math to avoid expensive integer modulo
+		int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);
+		int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);
+
+		cost += fill * (float(lrest) * larea + float(rrest) * rarea);
+
+		if (cost < bestcost)
+		{
+			bestcost = cost;
+			bestsplit = i + 1;
+		}
+	}
+
+	*out_cost = bestcost;
+	return bestsplit;
+}
+
+static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+	size_t l = 0, r = split;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned char side = sides[order[i]];
+		target[side ? r : l] = order[i];
+		l += 1;
+		l -= side;
+		r += side;
+	}
+
+	assert(l == split && r == count);
+}
+
+static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	if (depth >= kMeshletMaxTreeDepth)
+		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
+
+	if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))
+		return;
+
+	unsigned int* axes[3] = {orderx, ordery, orderz};
+
+	// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
+	size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
+
+	// if we could not pack the meshlet, we must be vertex bound
+	size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
+
+	// only use fill weight if we are optimizing for triangle count
+	float fill = count <= max_triangles ? 0.f : fill_weight;
+
+	// find best split that minimizes SAH
+	int bestk = -1;
+	size_t bestsplit = 0;
+	float bestcost = FLT_MAX;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		float axiscost = FLT_MAX;
+		size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);
+
+		if (axissplit && axiscost < bestcost)
+		{
+			bestk = k;
+			bestcost = axiscost;
+			bestsplit = axissplit;
+		}
+	}
+
+	// this may happen if SAH costs along the admissible splits are NaN
+	if (bestk < 0)
+		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
+
+	// mark sides of split for partitioning
+	unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+	for (size_t i = 0; i < bestsplit; ++i)
+		sides[axes[bestk][i]] = 0;
+
+	for (size_t i = bestsplit; i < count; ++i)
+		sides[axes[bestk][i]] = 1;
+
+	// partition all axes into two sides, maintaining order
+	unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		if (k == bestk)
+			continue;
+
+		unsigned int* axis = axes[k];
+		memcpy(temp, axis, sizeof(unsigned int) * count);
+		bvhPartition(axis, temp, sides, bestsplit, count);
+	}
+
+	bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+	bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
 } // namespace meshopt

 size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
@ -962,6 +1300,108 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	return meshlet_offset;
 }

+size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	if (index_count == 0)
+		return 0;
+
+	size_t face_count = index_count / 3;
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	// 3 floats plus 1 uint for sorting, or
+	// 2 floats for SAH costs, or
+	// 1 uint plus 1 byte for partitioning
+	float* scratch = allocator.allocate<float>(face_count * 4);
+
+	// compute bounding boxes and centroids for sorting
+	BVHBox* boxes = allocator.allocate<BVHBox>(face_count);
+	bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+
+	unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
+	unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		unsigned int* order = axes + k * face_count;
+		const float* keys = scratch + k * face_count;
+
+		unsigned int hist[1024][3];
+		computeHistogram(hist, keys, face_count);
+
+		// 3-pass radix sort computes the resulting order into axes
+		for (size_t i = 0; i < face_count; ++i)
+			temp[i] = unsigned(i);
+
+		radixPass(order, temp, keys, face_count, hist, 0);
+		radixPass(temp, order, keys, face_count, hist, 1);
+		radixPass(order, temp, keys, face_count, hist, 2);
+	}
+
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	memset(used, -1, vertex_count * sizeof(short));
+
+	unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
+
+	bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+
+	// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
+	size_t meshlet_count = 0;
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		assert(boundary[i] <= 1);
+		meshlet_count += boundary[i];
+	}
+
+	size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
+
+	// pack triangles into meshlets according to the order and boundaries marked by bvhSplit
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+	size_t meshlet_pending = meshlet_count;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		assert(boundary[i] <= 1);
+		bool split = i > 0 && boundary[i] == 1;
+
+		// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
+		if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
+			split = false;
+
+		unsigned int index = axes[i];
+		assert(index < face_count);
+
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
+		meshlet_pending -= boundary[i];
+	}
+
+	if (meshlet.triangle_count)
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshlet_bound);
+	return meshlet_offset;
+}
+
 meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
 	using namespace meshopt;
@ -1022,13 +1462,13 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t

 	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
 	float psphere[4] = {};
-	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0);
+	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);

 	float center[3] = {psphere[0], psphere[1], psphere[2]};

 	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
 	float nsphere[4] = {};
-	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0);
+	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);

 	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
 	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@ -1155,7 +1595,7 @@ meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count,
 	const float rzero = 0.f;

 	float psphere[4] = {};
-	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0);
+	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);

 	bounds.center[0] = psphere[0];
 	bounds.center[1] = psphere[1];