Mirror of https://github.com/godotengine/godot.git, synced 2025-10-19 16:03:29 +00:00
Update astcenc to the upstream 5.3.0 release
This is mostly a maintenance update that brings the compressor in line with the recently published Khronos Data Format Specification 1.4 release, which clarified some ambiguity in the specification. The update also brings minor codec optimizations, bug fixes, and image quality improvements. The biggest improvement for Godot is that builds using MSVC cl.exe now correctly default to the SSE2-optimized backend rather than the reference C backend, which makes compression more than 3 times faster. Builds using other compilers (GCC, LLVM/Clang) were not affected by the underlying issue and see no performance uplift.
Parent commit: 2303ce843a
Commit: 75ce42d463
24 changed files with 2068 additions and 813 deletions
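The MSVC issue mentioned in the commit message comes down to the SIMD backend selection in astcenc_mathlib.h (see the final hunk of this diff): MSVC cl.exe does not define __SSE2__, so earlier releases fell through to the reference C backend on x64 Windows builds. A minimal sketch of the selection logic follows; the lines outside the visible hunk (the SSE2 and fallback values) are filled in from upstream's convention and should be treated as illustrative.

#if defined(__SSE4_2__)
	#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
	#define ASTCENC_SSE 41
#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
	// _M_AMD64 is defined by MSVC for x64 targets, which always provide SSE2;
	// ARM64EC is excluded because it emulates x64 without native SSE.
	#define ASTCENC_SSE 20
#else
	#define ASTCENC_SSE 0
#endif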

@@ -180,7 +180,7 @@ License: BSD-3-clause
 
 Files: thirdparty/astcenc/*
 Comment: Arm ASTC Encoder
-Copyright: 2011-2024, Arm Limited
+Copyright: 2011-2025, Arm Limited
 License: Apache-2.0
 
 Files: thirdparty/basis_universal/*

thirdparty/README.md (vendored, 2 changed lines)

@@ -50,7 +50,7 @@ Files extracted from upstream source:
 ## astcenc
 
 - Upstream: https://github.com/ARM-software/astc-encoder
-- Version: 4.8.0 (0d6c9047c5ad19640e2d60fdb8f11a16675e7938, 2024)
+- Version: 5.3.0 (bf32abd05eccaf3042170b2a85cebdf0bfee5873, 2025)
 - License: Apache 2.0
 
 Files extracted from upstream source:

thirdparty/astcenc/astcenc.h (vendored, 16 changed lines)

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -784,6 +784,20 @@ ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
 ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
 	astcenc_context* context);
 
+/**
+ * @brief Cancel any pending compression operation.
+ *
+ * The caller must behave as if the compression completed normally, even though the data will be
+ * undefined. They are still responsible for synchronizing threads in the worker thread pool, and
+ * must call reset before starting another compression.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
+	astcenc_context* context);
+
 /**
  * @brief Decompress an image.
  *
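The new entry point has a strict contract, spelled out in the doxygen block above: after a cancel the compressed output is undefined, but the caller must still synchronize its worker threads and call astcenc_compress_reset() before reusing the context. A hypothetical caller-side sketch (not part of this patch; the worker-pool plumbing is assumed) might look like:

#include <thread>
#include <vector>

#include "astcenc.h"

// Assumed to exist elsewhere in the host application.
extern astcenc_context* ctx;              // from astcenc_context_alloc()
extern std::vector<std::thread> workers;  // each running astcenc_compress_image()

void abort_compression()
{
	// Stop new work being handed out; blocks already in flight still finish.
	if (astcenc_compress_cancel(ctx) != ASTCENC_SUCCESS)
	{
		return;
	}

	// The compressed output is now undefined, but the worker pool must still be
	// synchronized exactly as if the compression had completed normally.
	for (std::thread& t : workers)
	{
		t.join();
	}

	// A reset is required before the context can start another compression.
	astcenc_compress_reset(ctx);
}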

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -50,7 +50,7 @@ static void compute_partition_averages_rgb(
 vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 unsigned int partition_count = pi.partition_count;
-unsigned int texel_count = blk.texel_count;
+size_t texel_count = blk.texel_count;
 promise(texel_count > 0);
 
 // For 1 partition just use the precomputed mean
@@ -64,11 +64,11 @@ static void compute_partition_averages_rgb(
 vfloatacc pp_avg_rgb[3] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -100,11 +100,11 @@ static void compute_partition_averages_rgb(
 vfloatacc pp_avg_rgb[2][3] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -145,11 +145,11 @@ static void compute_partition_averages_rgb(
 vfloatacc pp_avg_rgb[3][3] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -221,7 +221,7 @@ static void compute_partition_averages_rgba(
 vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 unsigned int partition_count = pi.partition_count;
-unsigned int texel_count = blk.texel_count;
+size_t texel_count = blk.texel_count;
 promise(texel_count > 0);
 
 // For 1 partition just use the precomputed mean
@@ -235,11 +235,11 @@ static void compute_partition_averages_rgba(
 vfloat4 pp_avg_rgba[4] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -275,11 +275,11 @@ static void compute_partition_averages_rgba(
 vfloat4 pp_avg_rgba[2][4] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -326,11 +326,11 @@ static void compute_partition_averages_rgba(
 vfloat4 pp_avg_rgba[3][4] {};
 
 vint lane_id = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint texel_partition(pi.partition_of_texel + i);
 
-vmask lane_mask = lane_id < vint(texel_count);
+vmask lane_mask = lane_id < vint_from_size(texel_count);
 lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -390,17 +390,17 @@ void compute_avgs_and_dirs_4_comp(
 const image_block& blk,
 partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-int partition_count = pi.partition_count;
+size_t partition_count = pi.partition_count;
 promise(partition_count > 0);
 
 // Pre-compute partition_averages
 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 compute_partition_averages_rgba(pi, blk, partition_averages);
 
-for (int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-unsigned int texel_count = pi.partition_texel_count[partition];
+size_t texel_count = pi.partition_texel_count[partition];
 promise(texel_count > 0);
 
 vfloat4 average = partition_averages[partition];
@@ -411,7 +411,7 @@ void compute_avgs_and_dirs_4_comp(
 vfloat4 sum_zp = vfloat4::zero();
 vfloat4 sum_wp = vfloat4::zero();
 
-for (unsigned int i = 0; i < texel_count; i++)
+for (size_t i = 0; i < texel_count; i++)
 {
 unsigned int iwt = texel_indexes[i];
 vfloat4 texel_datum = blk.texel(iwt);
@@ -509,13 +509,13 @@ void compute_avgs_and_dirs_3_comp(
 partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
 }
 
-unsigned int partition_count = pi.partition_count;
+size_t partition_count = pi.partition_count;
 promise(partition_count > 0);
 
-for (unsigned int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-unsigned int texel_count = pi.partition_texel_count[partition];
+size_t texel_count = pi.partition_texel_count[partition];
 promise(texel_count > 0);
 
 vfloat4 average = partition_averages[partition];
@@ -525,7 +525,7 @@ void compute_avgs_and_dirs_3_comp(
 vfloat4 sum_yp = vfloat4::zero();
 vfloat4 sum_zp = vfloat4::zero();
 
-for (unsigned int i = 0; i < texel_count; i++)
+for (size_t i = 0; i < texel_count; i++)
 {
 unsigned int iwt = texel_indexes[i];
 
@@ -570,17 +570,17 @@ void compute_avgs_and_dirs_3_comp_rgb(
 const image_block& blk,
 partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-unsigned int partition_count = pi.partition_count;
+size_t partition_count = pi.partition_count;
 promise(partition_count > 0);
 
 // Pre-compute partition_averages
 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 compute_partition_averages_rgb(pi, blk, partition_averages);
 
-for (unsigned int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-unsigned int texel_count = pi.partition_texel_count[partition];
+size_t texel_count = pi.partition_texel_count[partition];
 promise(texel_count > 0);
 
 vfloat4 average = partition_averages[partition];
@@ -590,7 +590,7 @@ void compute_avgs_and_dirs_3_comp_rgb(
 vfloat4 sum_yp = vfloat4::zero();
 vfloat4 sum_zp = vfloat4::zero();
 
-for (unsigned int i = 0; i < texel_count; i++)
+for (size_t i = 0; i < texel_count; i++)
 {
 unsigned int iwt = texel_indexes[i];
 
@@ -664,20 +664,20 @@ void compute_avgs_and_dirs_2_comp(
 data_vg = blk.data_b;
 }
 
-unsigned int partition_count = pt.partition_count;
+size_t partition_count = pt.partition_count;
 promise(partition_count > 0);
 
-for (unsigned int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 const uint8_t *texel_indexes = pt.texels_of_partition[partition];
-unsigned int texel_count = pt.partition_texel_count[partition];
+size_t texel_count = pt.partition_texel_count[partition];
 promise(texel_count > 0);
 
 // Only compute a partition mean if more than one partition
 if (partition_count > 1)
 {
 average = vfloat4::zero();
-for (unsigned int i = 0; i < texel_count; i++)
+for (size_t i = 0; i < texel_count; i++)
 {
 unsigned int iwt = texel_indexes[i];
 average += vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -691,7 +691,7 @@ void compute_avgs_and_dirs_2_comp(
 vfloat4 sum_xp = vfloat4::zero();
 vfloat4 sum_yp = vfloat4::zero();
 
-for (unsigned int i = 0; i < texel_count; i++)
+for (size_t i = 0; i < texel_count; i++)
 {
 unsigned int iwt = texel_indexes[i];
 vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -729,20 +729,20 @@ void compute_error_squared_rgba(
 float& uncor_error,
 float& samec_error
 ) {
-unsigned int partition_count = pi.partition_count;
+size_t partition_count = pi.partition_count;
 promise(partition_count > 0);
 
 vfloatacc uncor_errorsumv = vfloatacc::zero();
 vfloatacc samec_errorsumv = vfloatacc::zero();
 
-for (unsigned int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
 
 processed_line4 l_uncor = uncor_plines[partition];
 processed_line4 l_samec = samec_plines[partition];
 
-unsigned int texel_count = pi.partition_texel_count[partition];
+size_t texel_count = pi.partition_texel_count[partition];
 promise(texel_count > 0);
 
 // Vectorize some useful scalar inputs
@@ -775,15 +775,15 @@ void compute_error_squared_rgba(
 // array to extend the last value. This means min/max are not impacted, but we need to mask
 // out the dummy values when we compute the line weighting.
 vint lane_ids = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
-vmask mask = lane_ids < vint(texel_count);
-vint texel_idxs(texel_indexes + i);
+vmask mask = lane_ids < vint_from_size(texel_count);
+const uint8_t* texel_idxs = texel_indexes + i;
 
-vfloat data_r = gatherf(blk.data_r, texel_idxs);
-vfloat data_g = gatherf(blk.data_g, texel_idxs);
-vfloat data_b = gatherf(blk.data_b, texel_idxs);
-vfloat data_a = gatherf(blk.data_a, texel_idxs);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
+vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
 
 vfloat uncor_param = (data_r * l_uncor_bs0)
 + (data_g * l_uncor_bs1)
@@ -847,17 +847,17 @@ void compute_error_squared_rgb(
 float& uncor_error,
 float& samec_error
 ) {
-unsigned int partition_count = pi.partition_count;
+size_t partition_count = pi.partition_count;
 promise(partition_count > 0);
 
 vfloatacc uncor_errorsumv = vfloatacc::zero();
 vfloatacc samec_errorsumv = vfloatacc::zero();
 
-for (unsigned int partition = 0; partition < partition_count; partition++)
+for (size_t partition = 0; partition < partition_count; partition++)
 {
 partition_lines3& pl = plines[partition];
 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-unsigned int texel_count = pi.partition_texel_count[partition];
+size_t texel_count = pi.partition_texel_count[partition];
 promise(texel_count > 0);
 
 processed_line3 l_uncor = pl.uncor_pline;
@@ -889,14 +889,14 @@ void compute_error_squared_rgb(
 // to extend the last value. This means min/max are not impacted, but we need to mask
 // out the dummy values when we compute the line weighting.
 vint lane_ids = vint::lane_id();
-for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
-vmask mask = lane_ids < vint(texel_count);
-vint texel_idxs(texel_indexes + i);
+vmask mask = lane_ids < vint_from_size(texel_count);
+const uint8_t* texel_idxs = texel_indexes + i;
 
-vfloat data_r = gatherf(blk.data_r, texel_idxs);
-vfloat data_g = gatherf(blk.data_g, texel_idxs);
-vfloat data_b = gatherf(blk.data_b, texel_idxs);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
 
 vfloat uncor_param = (data_r * l_uncor_bs0)
 + (data_g * l_uncor_bs1)

thirdparty/astcenc/astcenc_block_sizes.cpp (vendored, 26 changed lines)

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -384,12 +384,12 @@ static void init_decimation_info_2d(
 }
 
 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 {
 di.texel_weight_count[i] = 0;
 
-for (unsigned int j = 0; j < 4; j++)
+for (size_t j = 0; j < 4; j++)
 {
 di.texel_weight_contribs_float_tr[j][i] = 0;
 di.texel_weights_tr[j][i] = 0;
@@ -402,12 +402,12 @@ static void init_decimation_info_2d(
 unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 {
 di.weight_texel_count[i] = 0;
 
-for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
+for (size_t j = 0; j < max_texel_count_of_weight; j++)
 {
 di.weight_texels_tr[j][i] = last_texel;
 di.weights_texel_contribs_tr[j][i] = 0.0f;
@@ -640,12 +640,12 @@ static void init_decimation_info_3d(
 }
 
 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 {
 di.texel_weight_count[i] = 0;
 
-for (unsigned int j = 0; j < 4; j++)
+for (size_t j = 0; j < 4; j++)
 {
 di.texel_weight_contribs_float_tr[j][i] = 0;
 di.texel_weights_tr[j][i] = 0;
@@ -658,12 +658,12 @@ static void init_decimation_info_3d(
 int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 {
 di.weight_texel_count[i] = 0;
 
-for (int j = 0; j < max_texel_count_of_weight; j++)
+for (size_t j = 0; j < max_texel_count_of_weight; j++)
 {
 di.weight_texels_tr[j][i] = last_texel;
 di.weights_texel_contribs_tr[j][i] = 0.0f;

thirdparty/astcenc/astcenc_color_unquantize.cpp (vendored, 11 changed lines)

@@ -925,15 +925,8 @@ void unpack_color_endpoints(
 alpha_hdr = false;
 }
 
-vmask4 mask(true, true, true, false);
-
-vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
-vint4 output0a = output0 * 257;
-output0 = select(output0a, output0rgb, mask);
-
-vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
-vint4 output1a = output1 * 257;
-output1 = select(output1a, output1rgb, mask);
+output0 = lsl<8>(output0) | vint4(0x80);
+output1 = lsl<8>(output1) | vint4(0x80);
 }
 // An HDR profile decode, but may be using linear LDR endpoints
 // Linear LDR 8-bit endpoints are expanded to 16-bit by replication

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1280,7 +1280,7 @@ void compress_block(
 1.0f
 };
 
-static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
+const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
 
 // Only enable MODE0 fast path if enabled
 // Never enable for 3D blocks as no "always" block modes are available

@@ -98,19 +98,14 @@ void unpack_weights(
 if (!is_dual_plane)
 {
 // Build full 64-entry weight lookup table
-vint4 tab0 = vint4::load(scb.weights + 0);
-vint4 tab1 = vint4::load(scb.weights + 16);
-vint4 tab2 = vint4::load(scb.weights + 32);
-vint4 tab3 = vint4::load(scb.weights + 48);
-
-vint tab0p, tab1p, tab2p, tab3p;
-vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+vtable_64x8 table;
+vtable_prepare(table, scb.weights);
 
 for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint summed_value(8);
 vint weight_count(di.texel_weight_count + i);
-int max_weight_count = hmax(weight_count).lane<0>();
+int max_weight_count = hmax_s(weight_count);
 
 promise(max_weight_count > 0);
 for (int j = 0; j < max_weight_count; j++)
@@ -118,7 +113,7 @@ void unpack_weights(
 vint texel_weights(di.texel_weights_tr[j] + i);
 vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
 }
 
 store(lsr<4>(summed_value), weights_plane1 + i);
@@ -128,16 +123,12 @@ void unpack_weights(
 {
 // Build a 32-entry weight lookup table per plane
 // Plane 1
-vint4 tab0_plane1 = vint4::load(scb.weights + 0);
-vint4 tab1_plane1 = vint4::load(scb.weights + 16);
-vint tab0_plane1p, tab1_plane1p;
-vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+vtable_32x8 tab_plane1;
+vtable_prepare(tab_plane1, scb.weights);
 
 // Plane 2
-vint4 tab0_plane2 = vint4::load(scb.weights + 32);
-vint4 tab1_plane2 = vint4::load(scb.weights + 48);
-vint tab0_plane2p, tab1_plane2p;
-vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+vtable_32x8 tab_plane2;
+vtable_prepare(tab_plane2, scb.weights + 32);
 
 for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 {
@@ -145,7 +136,7 @@ void unpack_weights(
 vint sum_plane2(8);
 
 vint weight_count(di.texel_weight_count + i);
-int max_weight_count = hmax(weight_count).lane<0>();
+int max_weight_count = hmax_s(weight_count);
 
 promise(max_weight_count > 0);
 for (int j = 0; j < max_weight_count; j++)
@@ -153,8 +144,8 @@ void unpack_weights(
 vint texel_weights(di.texel_weights_tr[j] + i);
 vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
-sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
+sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
 }
 
 store(lsr<4>(sum_plane1), weights_plane1 + i);

thirdparty/astcenc/astcenc_entry.cpp (vendored, 25 changed lines)

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1123,6 +1123,29 @@ astcenc_error astcenc_compress_reset(
 #endif
 }
 
+/* See header for documentation. */
+astcenc_error astcenc_compress_cancel(
+	astcenc_context* ctxo
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
+	{
+		return ASTCENC_ERR_BAD_CONTEXT;
+	}
+
+	// Cancel compression before cancelling avg. This avoids the race condition
+	// where cancelling them in the other order could see a compression worker
+	// starting to process even though some of the avg data is undefined.
+	ctxo->manage_compress.cancel();
+	ctxo->manage_avg.cancel();
+	return ASTCENC_SUCCESS;
+#endif
+}
+
 /* See header for documentation. */
 astcenc_error astcenc_decompress_image(
 	astcenc_context* ctxo,

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -226,7 +226,7 @@ static void kmeans_update(
 
 uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
 
-// Find the center-of-gravity in each cluster
+// Find the center of gravity in each cluster
 for (unsigned int i = 0; i < texel_count; i++)
 {
 uint8_t partition = partition_of_texel[i];
@@ -425,8 +425,8 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
 }
 
 // Create a running sum from the histogram array
-// Cells store previous values only; i.e. exclude self after sum
-unsigned int sum = 0;
+// Indices store previous values only; i.e. exclude self after sum
+uint16_t sum = 0;
 for (unsigned int i = 0; i < texel_count; i++)
 {
 uint16_t cnt = mscount[i];

@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
 unsigned int index
 ) {
 // Load the bilinear filter texel weight indexes in the decimated grid
-vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
-vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
-vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
+const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
+const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
 
 // Load the bilinear filter weights from the decimated grid
-vfloat weight_val0 = gatherf(weights, weight_idx0);
-vfloat weight_val1 = gatherf(weights, weight_idx1);
-vfloat weight_val2 = gatherf(weights, weight_idx2);
-vfloat weight_val3 = gatherf(weights, weight_idx3);
+vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
+vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
+vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
 
 // Load the weight contribution factors for each decimated weight
 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
 unsigned int index
 ) {
 // Load the bilinear filter texel weight indexes in the decimated grid
-vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
 
 // Load the bilinear filter weights from the decimated grid
-vfloat weight_val0 = gatherf(weights, weight_idx0);
-vfloat weight_val1 = gatherf(weights, weight_idx1);
+vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
 
 // Load the weight contribution factors for each decimated weight
 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -195,8 +195,8 @@ static void compute_ideal_colors_and_weights_1_comp(
 }
 
 // Zero initialize any SIMD over-fetch
-unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-for (unsigned int i = texel_count; i < texel_count_simd; i++)
+size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+for (size_t i = texel_count; i < texel_count_simd; i++)
 {
 ei.weights[i] = 0.0f;
 ei.weight_error_scale[i] = 0.0f;
@@ -333,8 +333,8 @@ static void compute_ideal_colors_and_weights_2_comp(
 }
 
 // Zero initialize any SIMD over-fetch
-unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-for (unsigned int i = texel_count; i < texel_count_simd; i++)
+size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+for (size_t i = texel_count; i < texel_count_simd; i++)
 {
 ei.weights[i] = 0.0f;
 ei.weight_error_scale[i] = 0.0f;
@@ -500,8 +500,8 @@ static void compute_ideal_colors_and_weights_3_comp(
 }
 
 // Zero initialize any SIMD over-fetch
-unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-for (unsigned int i = texel_count; i < texel_count_simd; i++)
+size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+for (size_t i = texel_count; i < texel_count_simd; i++)
 {
 ei.weights[i] = 0.0f;
 ei.weight_error_scale[i] = 0.0f;
@@ -598,8 +598,8 @@ static void compute_ideal_colors_and_weights_4_comp(
 }
 
 // Zero initialize any SIMD over-fetch
-unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-for (unsigned int i = texel_count; i < texel_count_simd; i++)
+size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+for (size_t i = texel_count; i < texel_count_simd; i++)
 {
 ei.weights[i] = 0.0f;
 ei.weight_error_scale[i] = 0.0f;
@@ -853,12 +853,6 @@ void compute_ideal_weights_for_decimation(
 promise(texel_count > 0);
 promise(weight_count > 0);
 
-// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
-// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
-// arrays always contain space for 64 elements
-unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
-storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
-
 // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
 // zero-initialized SIMD over-fetch region
 if (is_direct)
@@ -873,7 +867,6 @@ void compute_ideal_weights_for_decimation(
 }
 
 // Otherwise compute an estimate and perform single refinement iteration
-ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 
 // Compute an initial average for each decimated weight
 bool constant_wes = ei.is_constant_weight_error_scale;
@@ -889,23 +882,23 @@ void compute_ideal_weights_for_decimation(
 
 // Accumulate error weighting of all the texels using this weight
 vint weight_texel_count(di.weight_texel_count + i);
-unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+unsigned int max_texel_count = hmax_s(weight_texel_count);
 promise(max_texel_count > 0);
 
 for (unsigned int j = 0; j < max_texel_count; j++)
 {
-vint texel(di.weight_texels_tr[j] + i);
+const uint8_t* texel = di.weight_texels_tr[j] + i;
 vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 if (!constant_wes)
 {
-weight_error_scale = gatherf(ei.weight_error_scale, texel);
+weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 }
 
 vfloat contrib_weight = weight * weight_error_scale;
 
 weight_weight += contrib_weight;
-initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
 }
 
 storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
@@ -914,6 +907,7 @@ void compute_ideal_weights_for_decimation(
 // Populate the interpolated weight grid based on the initial average
 // Process SIMD-width texel coordinates at at time while we can. Safe to
 // over-process full SIMD vectors - the tail is zeroed.
+ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 if (di.max_texel_weight_count <= 2)
 {
 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -947,22 +941,22 @@ void compute_ideal_weights_for_decimation(
 
 // Accumulate error weighting of all the texels using this weight
 vint weight_texel_count(di.weight_texel_count + i);
-unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+unsigned int max_texel_count = hmax_s(weight_texel_count);
 promise(max_texel_count > 0);
 
 for (unsigned int j = 0; j < max_texel_count; j++)
 {
-vint texel(di.weight_texels_tr[j] + i);
+const uint8_t* texel = di.weight_texels_tr[j] + i;
 vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 if (!constant_wes)
 {
-weight_error_scale = gatherf(ei.weight_error_scale, texel);
+weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 }
 
 vfloat scale = weight_error_scale * contrib_weight;
-vfloat old_weight = gatherf(infilled_weights, texel);
-vfloat ideal_weight = gatherf(ei.weights, texel);
+vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
+vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
 
 error_change0 += contrib_weight * scale;
 error_change1 += (old_weight - ideal_weight) * scale;
@@ -1023,9 +1017,8 @@ void compute_quantized_weights_for_decimation(
 // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
 if (get_quant_level(quant_level) <= 16)
 {
-vint4 tab0 = vint4::load(qat.quant_to_unquant);
-vint tab0p;
-vtable_prepare(tab0, tab0p);
+vtable_16x8 table;
+vtable_prepare(table, qat.quant_to_unquant);
 
 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 {
@@ -1038,8 +1031,8 @@ void compute_quantized_weights_for_decimation(
 vint weightl = float_to_int(ix1);
 vint weighth = min(weightl + vint(1), steps_m1);
 
-vint ixli = vtable_8bt_32bi(tab0p, weightl);
-vint ixhi = vtable_8bt_32bi(tab0p, weighth);
+vint ixli = vtable_lookup_32bit(table, weightl);
+vint ixhi = vtable_lookup_32bit(table, weighth);
 
 vfloat ixl = int_to_float(ixli);
 vfloat ixh = int_to_float(ixhi);
@@ -1050,16 +1043,13 @@ void compute_quantized_weights_for_decimation(
 
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
-vint scn = pack_low_bytes(weight);
-store_nbytes(scn, quantized_weight_set + i);
+pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 else
 {
-vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
-vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
-vint tab0p, tab1p;
-vtable_prepare(tab0, tab1, tab0p, tab1p);
+vtable_32x8 table;
+vtable_prepare(table, qat.quant_to_unquant);
 
 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 {
@@ -1072,8 +1062,8 @@ void compute_quantized_weights_for_decimation(
 vint weightl = float_to_int(ix1);
 vint weighth = min(weightl + vint(1), steps_m1);
 
-vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
-vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
+vint ixli = vtable_lookup_32bit(table, weightl);
+vint ixhi = vtable_lookup_32bit(table, weighth);
 
 vfloat ixl = int_to_float(ixli);
 vfloat ixh = int_to_float(ixhi);
@@ -1084,8 +1074,7 @@ void compute_quantized_weights_for_decimation(
 
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
-vint scn = pack_low_bytes(weight);
-store_nbytes(scn, quantized_weight_set + i);
+pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 }

thirdparty/astcenc/astcenc_internal.h (vendored, 14 changed lines)

@@ -1583,19 +1583,13 @@ static inline vmask4 get_u8_component_mask(
 astcenc_profile decode_mode,
 const image_block& blk
 ) {
-vmask4 u8_mask(false);
-// Decode mode writing to a unorm8 output value
-if (blk.decode_unorm8)
+// Decode mode or sRGB forces writing to unorm8 output value
+if (blk.decode_unorm8 || decode_mode == ASTCENC_PRF_LDR_SRGB)
 {
-u8_mask = vmask4(true);
-}
-// SRGB writing to a unorm8 RGB value
-else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-{
-u8_mask = vmask4(true, true, true, false);
+return vmask4(true);
 }
 
-return u8_mask;
+return vmask4(false);
 }
 
 /**
37
thirdparty/astcenc/astcenc_internal_entry.h
vendored
37
thirdparty/astcenc/astcenc_internal_entry.h
vendored
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -100,6 +100,9 @@ private:
 /** @brief Lock used for critical section and condition synchronization. */
 std::mutex m_lock;

+/** @brief True if the current operation is cancelled. */
+std::atomic<bool> m_is_cancelled;
+
 /** @brief True if the stage init() step has been executed. */
 bool m_init_done;

@ -147,6 +150,7 @@ public:
 {
 m_init_done = false;
 m_term_done = false;
+m_is_cancelled = false;
 m_start_count = 0;
 m_done_count = 0;
 m_task_count = 0;

@ -155,6 +159,16 @@ public:
 m_callback_min_diff = 1.0f;
 }

+/**
+* @brief Clear the tracker and stop new tasks being assigned.
+*
+* Note, all in-flight tasks in a worker will still complete normally.
+*/
+void cancel()
+{
+m_is_cancelled = true;
+}
+
 /**
 * @brief Trigger the pipeline stage init step.
 *

@ -211,7 +225,7 @@ public:
 unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
 {
 unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
-if (base >= m_task_count)
+if (m_is_cancelled || base >= m_task_count)
 {
 count = 0;
 return 0;

@ -241,16 +255,17 @@ public:
 local_count = m_done_count;
 local_last_value = m_callback_last_value;

-if (m_done_count == m_task_count)
+// Ensure the progress bar hits 100%
+if (m_callback && m_done_count == m_task_count)
 {
-// Ensure the progress bar hits 100%
-if (m_callback)
-{
-std::unique_lock<std::mutex> cblck(m_callback_lock);
-m_callback(100.0f);
-m_callback_last_value = 100.0f;
-}
+std::unique_lock<std::mutex> cblck(m_callback_lock);
+m_callback(100.0f);
+m_callback_last_value = 100.0f;
+}

+// Notify if nothing left to do
+if (m_is_cancelled || m_done_count == m_task_count)
+{
 lck.unlock();
 m_complete.notify_all();
 }

@ -285,7 +300,7 @@ public:
 void wait()
 {
 std::unique_lock<std::mutex> lck(m_lock);
-m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
+m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
 }

 /**

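The cancellation support added here is a single atomic flag: cancel() sets it, get_task_assignment() stops handing out work once it is set, and wait() treats cancellation as completion. A hedged sketch of the same pattern, reduced to a standalone class (task_tracker and its members are illustrative names, not the astcenc types; the real code wakes waiters from the task-completion path rather than from cancel()):

#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <mutex>

class task_tracker
{
public:
	void init(unsigned int task_count)
	{
		m_task_count = task_count;
		m_start_count = 0;
		m_done_count = 0;
		m_is_cancelled = false;
	}

	// Stop new tasks being assigned; in-flight tasks still finish normally
	void cancel()
	{
		{
			std::lock_guard<std::mutex> lck(m_lock);
			m_is_cancelled = true;
		}
		// Simplification for the sketch: wake waiters immediately
		m_complete.notify_all();
	}

	// Returns the number of tasks granted; zero once cancelled or exhausted
	unsigned int get_task_assignment(unsigned int granule, unsigned int& base)
	{
		base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
		if (m_is_cancelled || base >= m_task_count)
		{
			return 0;
		}
		return std::min(granule, m_task_count - base);
	}

	void complete_tasks(unsigned int count)
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_done_count += count;
		if (m_is_cancelled || m_done_count == m_task_count)
		{
			lck.unlock();
			m_complete.notify_all();
		}
	}

	void wait()
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_complete.wait(lck, [this] { return m_is_cancelled || m_done_count == m_task_count; });
	}

private:
	std::mutex m_lock;
	std::condition_variable m_complete;
	std::atomic<bool> m_is_cancelled { false };
	std::atomic<unsigned int> m_start_count { 0 };
	unsigned int m_done_count = 0;
	unsigned int m_task_count = 0;
};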
27 thirdparty/astcenc/astcenc_mathlib.h (vendored)
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -48,7 +48,7 @@
 #define ASTCENC_SSE 42
 #elif defined(__SSE4_1__)
 #define ASTCENC_SSE 41
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
 #define ASTCENC_SSE 20
 #else
 #define ASTCENC_SSE 0

@ -58,25 +58,42 @@
 #ifndef ASTCENC_AVX
 #if defined(__AVX2__)
 #define ASTCENC_AVX 2
+#define ASTCENC_X86_GATHERS 1
 #elif defined(__AVX__)
 #define ASTCENC_AVX 1
+#define ASTCENC_X86_GATHERS 1
 #else
 #define ASTCENC_AVX 0
 #endif
 #endif

 #ifndef ASTCENC_NEON
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define ASTCENC_NEON 1
 #else
 #define ASTCENC_NEON 0
 #endif
 #endif

+#ifndef ASTCENC_SVE
+#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+#define ASTCENC_SVE 8
+// Auto-detected SVE can only assume vector width of 4 is available, but
+// must also allow for hardware being longer and so all use of intrinsics
+// must explicitly use predicate masks to limit to 4-wide.
+#else
+#define ASTCENC_SVE 4
+#endif
+#else
+#define ASTCENC_SVE 0
+#endif
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
 #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
 #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else

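The new _M_AMD64 clause is what restores the SSE2 default on MSVC cl.exe, which defines _M_AMD64 on x64 builds but never defines __SSE2__. A compilable sketch of the same detection order, collapsed into one translation unit so the selected backend can be printed on any host (the DEMO_BACKEND macro is purely illustrative):

#include <cstdio>

#if defined(__AVX2__)
	#define DEMO_BACKEND "AVX2 (8-wide)"
#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
	#define DEMO_BACKEND "SSE2 (4-wide)"
#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
	#define DEMO_BACKEND "SVE 256-bit (8-wide)"
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
	#define DEMO_BACKEND "NEON (4-wide)"
#else
	#define DEMO_BACKEND "scalar reference"
#endif

int main()
{
	// MSVC x64 lands in the SSE2 branch via _M_AMD64, matching the commit
	// message's note about the faster default backend
	std::printf("selected backend: %s\n", DEMO_BACKEND);
	return 0;
}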
@ -304,9 +304,9 @@ static bool generate_one_partition_info_entry(
 // Fill loop tail so we can overfetch later
 for (unsigned int i = 0; i < partition_count; i++)
 {
-int ptex_count = counts[i];
-int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
-for (int j = ptex_count; j < ptex_count_simd; j++)
+size_t ptex_count = counts[i];
+size_t ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
+for (size_t j = ptex_count; j < ptex_count_simd; j++)
 {
 pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
 }

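The loop tail above pads each per-partition texel list up to a SIMD multiple by repeating the last valid entry, so later vector loops can overfetch without bounds checks. A standalone sketch of that idiom, with an assumed SIMD width of 8:

#include <cstdio>
#include <vector>

constexpr size_t SIMD_WIDTH = 8;

static size_t round_up_to_simd_multiple(size_t count)
{
	return ((count + SIMD_WIDTH - 1) / SIMD_WIDTH) * SIMD_WIDTH;
}

int main()
{
	std::vector<unsigned short> texels { 3, 7, 11, 19, 23 };
	size_t count = texels.size();
	size_t padded = round_up_to_simd_multiple(count);

	// Fill the tail with copies of the last valid texel index
	texels.resize(padded);
	for (size_t j = count; j < padded; j++)
	{
		texels[j] = texels[count - 1];
	}

	for (unsigned short t : texels)
	{
		std::printf("%u ", t);
	}
	std::printf("\n");
	return 0;
}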
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
 vint lane_ids = vint::lane_id();
 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
-vint tix(texel_indexes + i);
+const uint8_t* tix = texel_indexes + i;

 vmask mask = lane_ids < vint(texel_count);
 lane_ids += vint(ASTCENC_SIMD_WIDTH);

 // Compute the error that arises from just ditching alpha
-vfloat data_a = gatherf(blk.data_a, tix);
+vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
 vfloat alpha_diff = data_a - default_a;
 alpha_diff = alpha_diff * alpha_diff;

 haccumulate(a_drop_errv, alpha_diff, mask);

-vfloat data_r = gatherf(blk.data_r, tix);
-vfloat data_g = gatherf(blk.data_g, tix);
-vfloat data_b = gatherf(blk.data_b, tix);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);

 // Compute uncorrelated error
 vfloat param = data_r * uncor_bs0

@ -1135,13 +1135,13 @@ unsigned int compute_ideal_endpoint_formats(
 vfloat clear_error(ERROR_CALC_DEFAULT);
 vint clear_quant(0);

-unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
+size_t packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
 storea(clear_error, errors_of_best_combination + packed_start_block_mode);
 store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
 store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);

 // Ensure that last iteration overstep contains data that will never be picked
-unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
+size_t packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
 storea(clear_error, errors_of_best_combination + packed_end_block_mode);
 store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
 store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);

@ -1292,9 +1292,12 @@ unsigned int compute_ideal_endpoint_formats(
 vint vbest_error_index(-1);
 vfloat vbest_ep_error(ERROR_CALC_DEFAULT);

-start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
-vint lane_ids = vint::lane_id() + vint(start_block_mode);
-for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
+// TODO: This should use size_t for the inputs of start/end_block_mode
+// to avoid some of this type conversion, but that propagates and will
+// need a bigger PR to fix
+size_t start_mode = round_down_to_simd_multiple_vla(start_block_mode);
+vint lane_ids = vint::lane_id() + vint_from_size(start_mode);
+for (size_t j = start_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
 {
 vfloat err = vfloat(errors_of_best_combination + j);
 vmask mask = err < vbest_ep_error;

@ -1306,8 +1309,8 @@ unsigned int compute_ideal_endpoint_formats(
 // Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-vbest_error_index = hmin(vbest_error_index);
-int best_error_index = vbest_error_index.lane<0>();
+int best_error_index = hmin_s(vbest_error_index);

 best_error_weights[i] = best_error_index;

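The last hunk keeps results invariant across vector widths: lanes that do not hold the minimum error get their index replaced by a large sentinel, and the horizontal minimum of the surviving indices picks the lowest matching one. A scalar sketch of the same reduction (errors array and lane count are made up for illustration):

#include <algorithm>
#include <climits>
#include <cstdio>

int main()
{
	// Candidate errors for eight "lanes"; three lanes tie on the minimum
	const float errors[8] = { 9.0f, 2.5f, 7.0f, 2.5f, 4.0f, 8.0f, 2.5f, 6.0f };

	// hmin over the errors
	float min_error = *std::min_element(errors, errors + 8);

	// select(): keep a lane's index only if it holds the minimum error
	// hmin_s(): reduce to the smallest surviving index
	int best_index = INT_MAX;
	for (int i = 0; i < 8; i++)
	{
		int candidate = (errors[i] == min_error) ? i : INT_MAX;
		best_index = std::min(best_index, candidate);
	}

	std::printf("best index %d for error %f\n", best_index, min_error);
	return 0;
}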
118 thirdparty/astcenc/astcenc_vecmathlib.h (vendored)
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2025 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -42,11 +42,12 @@
 *
 * With the current implementation ISA support is provided for:
 *
-* * 1-wide for scalar reference.
-* * 4-wide for Armv8-A NEON.
-* * 4-wide for x86-64 SSE2.
-* * 4-wide for x86-64 SSE4.1.
-* * 8-wide for x86-64 AVX2.
+* * 1-wide for scalar reference
+* * 4-wide for Armv8-A NEON
+* * 4-wide for x86-64 SSE2
+* * 4-wide for x86-64 SSE4.1
+* * 8-wide for Armv8-A SVE
+* * 8-wide for x86-64 AVX2
 */

 #ifndef ASTC_VECMATHLIB_H_INCLUDED

@ -54,7 +55,14 @@

 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 #include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 #include <arm_neon.h>
 #endif

@ -69,8 +77,10 @@
 #define ASTCENC_NO_INLINE __attribute__ ((noinline))
 #endif

+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
 #if ASTCENC_AVX >= 2
-/* If we have AVX2 expose 8-wide VLA. */
+// If we have AVX2 expose 8-wide VLA.
 #include "astcenc_vecmathlib_sse_4.h"
 #include "astcenc_vecmathlib_common_4.h"
 #include "astcenc_vecmathlib_avx2_8.h"

@ -88,11 +98,16 @@
 using vint = vint8;
 using vmask = vmask8;

+using vtable_16x8 = vtable8_16x8;
+using vtable_32x8 = vtable8_32x8;
+using vtable_64x8 = vtable8_64x8;
+
 constexpr auto loada = vfloat8::loada;
 constexpr auto load1 = vfloat8::load1;
+constexpr auto vint_from_size = vint8_from_size;

 #elif ASTCENC_SSE >= 20
-/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
 #include "astcenc_vecmathlib_sse_4.h"
 #include "astcenc_vecmathlib_common_4.h"

@ -103,11 +118,48 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
+constexpr auto vint_from_size = vint4_from_size;
+
+#elif ASTCENC_SVE == 8
+// Check the compiler is configured with fixed-length 256-bit SVE.
+#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
+#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
+#endif
+
+// If we have SVE configured as 8-wide, expose 8-wide VLA.
+#include "astcenc_vecmathlib_neon_4.h"
+#include "astcenc_vecmathlib_common_4.h"
+#include "astcenc_vecmathlib_sve_8.h"
+
+#define ASTCENC_SIMD_WIDTH 8
+
+using vfloat = vfloat8;
+
+#if defined(ASTCENC_NO_INVARIANCE)
+using vfloatacc = vfloat8;
+#else
+using vfloatacc = vfloat4;
+#endif
+
+using vint = vint8;
+using vmask = vmask8;
+
+using vtable_16x8 = vtable8_16x8;
+using vtable_32x8 = vtable8_32x8;
+using vtable_64x8 = vtable8_64x8;
+
+constexpr auto loada = vfloat8::loada;
+constexpr auto load1 = vfloat8::load1;
+constexpr auto vint_from_size = vint8_from_size;

 #elif ASTCENC_NEON > 0
-/* If we have NEON expose 4-wide VLA. */
+// If we have NEON expose 4-wide VLA.
 #include "astcenc_vecmathlib_neon_4.h"
 #include "astcenc_vecmathlib_common_4.h"

@ -118,8 +170,13 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
+constexpr auto vint_from_size = vint4_from_size;

 #else
 // If we have nothing expose 4-wide VLA, and 4-wide fixed width.

@ -150,34 +207,15 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
+constexpr auto vint_from_size = vint4_from_size;
 #endif

-/**
-* @brief Round a count down to the largest multiple of 8.
-*
-* @param count The unrounded value.
-*
-* @return The rounded value.
-*/
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
-{
-return count & static_cast<unsigned int>(~(8 - 1));
-}
-
-/**
-* @brief Round a count down to the largest multiple of 4.
-*
-* @param count The unrounded value.
-*
-* @return The rounded value.
-*/
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
-{
-return count & static_cast<unsigned int>(~(4 - 1));
-}
-
 /**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *

@ -187,9 +225,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int coun
 *
 * @return The rounded value.
 */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
 {
-return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
 }

 /**

@ -201,9 +239,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int co
 *
 * @return The rounded value.
 */
-ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
 {
-unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
 return multiples * ASTCENC_SIMD_WIDTH;
 }

@ -239,8 +277,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
 {
 vfloat z = atan(abs(y / x));
-vmask xmask = vmask(float_as_int(x).m);
-return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+vmask xmask = x < vfloat::zero();
+return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
 }

 /*

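The atan2 hunk swaps an MSB-based select for an explicit "x is negative" mask, but the quadrant handling is unchanged: the partial result for |y/x| is flipped to PI minus z when x is negative, then the sign of y is copied onto the result. A scalar sketch of that flow, checked against the standard library:

#include <cmath>
#include <cstdio>

static float atan2_sketch(float y, float x)
{
	float z = std::atan(std::fabs(y / x));
	if (x < 0.0f)
	{
		// Mirror into the correct quadrant when x is negative
		z = 3.14159265358979323846f - z;
	}
	// Copy the sign of y onto the magnitude
	return std::copysign(z, y);
}

int main()
{
	std::printf("sketch %f vs std %f\n", atan2_sketch(1.0f, -1.0f), std::atan2(1.0f, -1.0f));
	return 0;
}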
330 thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h (vendored)
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -54,7 +54,7 @@ struct vfloat8
 ASTCENC_SIMD_INLINE vfloat8() = default;

 /**
-* @brief Construct from 4 values loaded from an unaligned address.
+* @brief Construct from 8 values loaded from an unaligned address.
 *
 * Consider using loada() which is better with vectors if data is aligned
 * to vector length.

@ -74,18 +74,6 @@ struct vfloat8
 m = _mm256_set1_ps(a);
 }

-/**
-* @brief Construct from 8 scalar values.
-*
-* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-*/
-ASTCENC_SIMD_INLINE explicit vfloat8(
-float a, float b, float c, float d,
-float e, float f, float g, float h)
-{
-m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-}
-
 /**
 * @brief Construct from an existing SIMD register.
 */

@ -94,20 +82,6 @@ struct vfloat8
 m = a;
 }

-/**
-* @brief Get the scalar value of a single lane.
-*/
-template <int l> ASTCENC_SIMD_INLINE float lane() const
-{
-#if !defined(__clang__) && defined(_MSC_VER)
-return m.m256_f32[l];
-#else
-union { __m256 m; float f[8]; } cvt;
-cvt.m = m;
-return cvt.f[l];
-#endif
-}
-
 /**
 * @brief Factory that returns a vector of zeros.
 */

@ -132,14 +106,6 @@ struct vfloat8
 return vfloat8(_mm256_load_ps(p));
 }

-/**
-* @brief Factory that returns a vector containing the lane IDs.
-*/
-static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-{
-return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-}
-
 /**
 * @brief The vector ...
 */

@ -183,25 +149,13 @@ struct vint8
 /**
 * @brief Construct from 1 scalar value replicated across all lanes.
 *
-* Consider using vfloat4::zero() for constexpr zeros.
+* Consider using zero() for constexpr zeros.
 */
 ASTCENC_SIMD_INLINE explicit vint8(int a)
 {
 m = _mm256_set1_epi32(a);
 }

-/**
-* @brief Construct from 8 scalar values.
-*
-* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-*/
-ASTCENC_SIMD_INLINE explicit vint8(
-int a, int b, int c, int d,
-int e, int f, int g, int h)
-{
-m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-}
-
 /**
 * @brief Construct from an existing SIMD register.
 */

@ -210,20 +164,6 @@ struct vint8
 m = a;
 }

-/**
-* @brief Get the scalar from a single lane.
-*/
-template <int l> ASTCENC_SIMD_INLINE int lane() const
-{
-#if !defined(__clang__) && defined(_MSC_VER)
-return m.m256i_i32[l];
-#else
-union { __m256i m; int f[8]; } cvt;
-cvt.m = m;
-return cvt.f[l];
-#endif
-}
-
 /**
 * @brief Factory that returns a vector of zeros.
 */

@ -518,31 +458,54 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
 */
 ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 {
-__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+// Build min within groups of 2, then 4, then 8
+__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

-__m256i r = astcenc_mm256_set_m128i(m, m);
-vint8 vmin(r);
+vint8 vmin(m);
 return vmin;
 }

+/**
+* @brief Return the horizontal minimum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
 * @brief Return the horizontal maximum of a vector.
 */
 ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 {
-__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+// Build max within groups of 2, then 4, then 8
+__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

-__m256i r = astcenc_mm256_set_m128i(m, m);
-vint8 vmax(r);
+vint8 vmax(m);
 return vmax;
 }

+/**
+* @brief Return the horizontal maximum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
+/**
+* @brief Generate a vint8 from a size_t.
+*/
+ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+{
+assert(a <= std::numeric_limits<int>::max());
+return vint8(static_cast<int>(a));
+}
+
 /**
 * @brief Store a vector to a 16B aligned memory address.
 */

@ -570,18 +533,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }

-/**
-* @brief Gather N (vector width) indices from the array.
-*/
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
-ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
 {
 __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 28, 24, 20, 16,

@ -593,7 +548,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
 __m128i b = _mm_unpacklo_epi32(a0, a1);

 __m256i r = astcenc_mm256_set_m128i(b, b);
-return vint8(r);
+store_nbytes(vint8(r), p);
 }

 /**

@ -606,7 +562,7 @@ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
 }

 // ============================================================================
-// vfloat4 operators and functions
+// vfloat8 operators and functions
 // ============================================================================

 /**

@ -674,7 +630,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
 return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
 }

-
 /**
 * @brief Overload: scalar by vector division.
 */

@ -683,7 +638,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
 return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
 }

-
 /**
 * @brief Overload: vector by vector equality.
 */

@ -786,19 +740,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 return a;
 }

-/**
-* @brief Return a clamped value between 0.0f and max.
-*
-* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
-* be returned for that lane.
-*/
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-return a;
-}
-
 /**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *

@ -857,7 +798,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
 */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-return hmin(a).lane<0>();
+return _mm256_cvtss_f32(hmin(a).m);
 }

 /**

@ -887,7 +828,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
 */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-return hmax(a).lane<0>();
+return _mm256_cvtss_f32(hmax(a).m);
 }

 /**

@ -909,14 +850,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
 return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
 }

-/**
-* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
-*/
-ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
-{
-return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
-}
-
 /**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *

@ -979,6 +912,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
 return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
 }

+/**
+* @brief Load a vector of gathered results from an array using byte indices from memory
+*/
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_X86_GATHERS == 0
+// Perform manual gather using scalar loads in two separate dependency chains,
+// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
+// into a bunch of memory-operand inserts on 128-bit halves then merges late,
+// which performs significantly worse in tests.
+__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
+__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
+
+return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
+#else
+vint8 inds(indices);
+return gatherf(base, inds);
+#endif
+}
+
 /**
 * @brief Store a vector to an unaligned memory address.
 */

@ -1045,98 +1005,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 return vfloat8(_mm256_castsi256_ps(a.m));
 }

-/**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
-*/
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-}
-
-/**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
-*/
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-
-__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
-}
+/*
+* Table structure for a 16x 8-bit entry table.
+*/
+struct vtable8_16x8 {
+vint8 t0;
+};
+
+/*
+* Table structure for a 32x 8-bit entry table.
+*/
+struct vtable8_32x8 {
+vint8 t0;
+vint8 t1;
+};
+
+/*
+* Table structure for a 64x 8-bit entry table.
+*/
+struct vtable8_64x8 {
+vint8 t0;
+vint8 t1;
+vint8 t2;
+vint8 t3;
+};

 /**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
+* @brief Prepare a vtable lookup table for 16x 8-bit entry table.
 */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-
-__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
-
-__m128i t2n = _mm_xor_si128(t1.m, t2.m);
-t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
-
-__m128i t3n = _mm_xor_si128(t2.m, t3.m);
-t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
-}
+vtable8_16x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);
+
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+}

 /**
-* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+* @brief Prepare a vtable lookup table for 32x 8-bit entry table.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+vtable8_32x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);
+vint4 d1 = vint4::load(data + 16);
+
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+
+// XOR chain the high rows to allow table emulation
+table.t1 = table.t1 ^ table.t0;
+}
+
+/**
+* @brief Prepare a vtable lookup table 64x 8-bit entry table.
+*/
+ASTCENC_SIMD_INLINE void vtable_prepare(
+vtable8_64x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);
+vint4 d1 = vint4::load(data + 16);
+vint4 d2 = vint4::load(data + 32);
+vint4 d3 = vint4::load(data + 48);
+
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
+table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));
+
+// XOR chain the high rows to allow table emulation
+table.t3 = table.t3 ^ table.t2;
+table.t2 = table.t2 ^ table.t1;
+table.t1 = table.t1 ^ table.t0;
+}
+
+/**
+* @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+*/
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_16x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
 __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

-__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 return vint8(result);
 }

 /**
-* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+* @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_32x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
 __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

-__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

-__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 result = _mm256_xor_si256(result, result2);
 return vint8(result);
 }

 /**
-* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+* @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_64x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
 __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

-__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

-__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 result = _mm256_xor_si256(result, result2);
 idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

-result2 = _mm256_shuffle_epi8(t2.m, idxx);
+result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
 result = _mm256_xor_si256(result, result2);
 idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

-result2 = _mm256_shuffle_epi8(t3.m, idxx);
+result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
 result = _mm256_xor_si256(result, result2);

 return vint8(result);

@ -1146,7 +1148,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */

@ -1183,8 +1185,12 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
 {
 alignas(32) int v[8];
 storea(a, v);

+unsigned int uv[8];
+std::memcpy(uv, v, sizeof(int) * 8);
+
 printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
-v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
 }

 /**

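The XOR-chained tables above let pshufb emulate lookups wider than 16 entries: the prepare step stores t1 ^ t0, so the lookup can XOR two shuffle passes together. Indices below 16 zero the second pass (the subtracted index byte gets its MSB set), while indices 16..31 pick up a duplicate t0 term in the first pass that cancels under XOR. A scalar sketch of that cancellation, with the shuffle modelled by a small helper (all names here are illustrative):

#include <cstdint>
#include <cstdio>

// Model of pshufb byte semantics: a set MSB zeroes the lane, otherwise the
// low four bits select the byte within a 16-entry row
static uint8_t shuffle_model(const uint8_t* row, int idx_byte)
{
	uint8_t b = static_cast<uint8_t>(idx_byte);
	return (b & 0x80) ? 0 : row[b & 0x0F];
}

int main()
{
	uint8_t t0[16];
	uint8_t t1[16];
	for (int i = 0; i < 16; i++)
	{
		t0[i] = static_cast<uint8_t>(i);
		t1[i] = static_cast<uint8_t>(100 + i);
	}

	// Prepare step: XOR chain the high row
	uint8_t t1x[16];
	for (int i = 0; i < 16; i++)
	{
		t1x[i] = static_cast<uint8_t>(t1[i] ^ t0[i]);
	}

	// Lookup step for one index in [0, 32)
	int idx = 21;
	uint8_t r = shuffle_model(t0, idx);
	r = static_cast<uint8_t>(r ^ shuffle_model(t1x, idx - 16));
	std::printf("lookup(%d) = %u (expected %u)\n", idx, r, t1[idx - 16]);
	return 0;
}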
66 thirdparty/astcenc/astcenc_vecmathlib_common_4.h (vendored)
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@ -31,26 +31,7 @@
 #endif

 #include <cstdio>
+#include <limits>

-// ============================================================================
-// vmask4 operators and functions
-// ============================================================================
-
-/**
-* @brief True if any lanes are enabled, false otherwise.
-*/
-ASTCENC_SIMD_INLINE bool any(vmask4 a)
-{
-return mask(a) != 0;
-}
-
-/**
-* @brief True if all lanes are enabled, false otherwise.
-*/
-ASTCENC_SIMD_INLINE bool all(vmask4 a)
-{
-return mask(a) == 0xF;
-}
-
 // ============================================================================
 // vint4 operators and functions

@ -129,6 +110,31 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }

+/**
+* @brief Return the horizontal minimum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+return hmin(a).lane<0>();
+}
+
+/**
+* @brief Generate a vint4 from a size_t.
+*/
+ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
+{
+assert(a <= std::numeric_limits<int>::max());
+return vint4(static_cast<int>(a));
+}
+
+/**
+* @brief Return the horizontal maximum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================

@ -222,18 +228,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 return min(max(a, minv), maxv);
 }

-/**
-* @brief Return the clamped value between 0.0f and max.
-*
-* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
-* be returned for that lane.
-*/
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-// Do not reorder - second operand will return if either is NaN
-return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
 * @brief Return the clamped value between 0.0f and 1.0f.
 *

@ -396,8 +390,12 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
 {
 ASTCENC_ALIGNAS int v[4];
 storea(a, v);

+unsigned int uv[4];
+std::memcpy(uv, v, sizeof(int) * 4);
+
 printf("v4_i32:\n %08x %08x %08x %08x\n",
-v[0], v[1], v[2], v[3]);
+uv[0], uv[1], uv[2], uv[3]);
 }

 /**

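The print helpers above now copy the signed lane values into unsigned storage with memcpy before formatting with %08x, which keeps the bit pattern while avoiding a signed argument being consumed by an unsigned conversion specifier. A small standalone sketch of the same idiom:

#include <cstdio>
#include <cstring>

int main()
{
	int v[4] = { -1, 0x7FFFFFFF, 42, -2 };
	unsigned int uv[4];

	// Copy the raw bits into unsigned storage before printing as hex
	std::memcpy(uv, v, sizeof(v));
	std::printf("%08x %08x %08x %08x\n", uv[0], uv[1], uv[2], uv[3]);
	return 0;
}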
224 thirdparty/astcenc/astcenc_vecmathlib_neon_4.h (vendored)
|
@ -1,6 +1,6 @@
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
// Copyright 2019-2023 Arm Limited
|
// Copyright 2019-2024 Arm Limited
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||||
// use this file except in compliance with the License. You may obtain a copy
|
// use this file except in compliance with the License. You may obtain a copy
|
||||||
|
@ -115,7 +115,7 @@ struct vfloat4
|
||||||
*/
|
*/
|
||||||
static ASTCENC_SIMD_INLINE vfloat4 zero()
|
static ASTCENC_SIMD_INLINE vfloat4 zero()
|
||||||
{
|
{
|
||||||
return vfloat4(vdupq_n_f32(0.0f));
|
return vfloat4(0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -134,15 +134,6 @@ struct vfloat4
|
||||||
return vfloat4(vld1q_f32(p));
|
return vfloat4(vld1q_f32(p));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Factory that returns a vector containing the lane IDs.
|
|
||||||
*/
|
|
||||||
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
|
|
||||||
{
|
|
||||||
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
|
|
||||||
return vfloat4(vld1q_f32(data));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Return a swizzled float 2.
|
* @brief Return a swizzled float 2.
|
||||||
*/
|
*/
|
||||||
|
@ -203,16 +194,21 @@ struct vint4
|
||||||
*/
|
*/
|
||||||
ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
|
ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
|
||||||
{
|
{
|
||||||
// Cast is safe - NEON loads are allowed to be unaligned
|
#if ASTCENC_SVE == 0
|
||||||
uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
|
// Cast is safe - NEON loads are allowed to be unaligned
|
||||||
uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
|
uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
|
||||||
m = vreinterpretq_s32_u32(vmovl_u16(t16));
|
uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
|
||||||
|
m = vreinterpretq_s32_u32(vmovl_u16(t16));
|
||||||
|
#else
|
||||||
|
svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
|
||||||
|
m = svget_neonq(data);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Construct from 1 scalar value replicated across all lanes.
|
* @brief Construct from 1 scalar value replicated across all lanes.
|
||||||
*
|
*
|
||||||
* Consider using vfloat4::zero() for constexpr zeros.
|
* Consider using zero() for constexpr zeros.
|
||||||
*/
|
*/
|
||||||
ASTCENC_SIMD_INLINE explicit vint4(int a)
|
ASTCENC_SIMD_INLINE explicit vint4(int a)
|
||||||
{
|
{
|
||||||
|
@ -420,6 +416,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
|
||||||
return vaddvq_u32(vshlq_u32(tmp, shift));
|
return vaddvq_u32(vshlq_u32(tmp, shift));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief True if any lanes are enabled, false otherwise.
|
||||||
|
*/
|
||||||
|
ASTCENC_SIMD_INLINE bool any(vmask4 a)
|
||||||
|
{
|
||||||
|
return vmaxvq_u32(a.m) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief True if all lanes are enabled, false otherwise.
|
||||||
|
*/
|
||||||
|
ASTCENC_SIMD_INLINE bool all(vmask4 a)
|
||||||
|
{
|
||||||
|
return vminvq_u32(a.m) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// vint4 operators and functions
|
// vint4 operators and functions
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
@ -570,15 +582,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
|
||||||
return vint4(vmaxvq_s32(a.m));
|
return vint4(vmaxvq_s32(a.m));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Return the horizontal sum of a vector.
|
|
||||||
*/
|
|
||||||
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
|
|
||||||
{
|
|
||||||
int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
|
|
||||||
return vget_lane_s32(vpadd_s32(t, t), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Store a vector to a 16B aligned memory address.
|
* @brief Store a vector to a 16B aligned memory address.
|
||||||
*/
|
*/
|
||||||
|
@@ -612,31 +615,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 }
 
 /**
- * @brief Gather N (vector width) indices from the array.
+ * @brief Pack and store low 8 bits of each vector lane.
  */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
-/**
- * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
- */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
 {
 	alignas(16) uint8_t shuf[16] {
 		0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	uint8x16_t idx = vld1q_u8(shuf);
 	int8x16_t av = vreinterpretq_s8_s32(a.m);
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	store_nbytes(a, data);
 }
 
 /**
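For orientation (not from the upstream patch): call sites previously packed the low bytes and stored them in two steps; the new helper fuses both. A sketch of the migration, with `result` and `dst` as hypothetical names:

```cpp
// Before (astcenc 4.x API):
//   vint4 packed = pack_low_bytes(result);
//   store_nbytes(packed, dst);

// After (astcenc 5.x API): one fused helper writes the 4 low bytes to dst.
pack_and_store_low_bytes(result, dst);
```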
@@ -814,21 +803,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
-	uint32x4_t mask = vcgeq_u32(cond.m, msb);
-	return vfloat4(vbslq_f32(mask, b.m, a.m));
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
+#if ASTCENC_SVE == 0
 	alignas(16) int idx[4];
 	storea(indices, idx);
 	alignas(16) float vals[4];

@@ -837,8 +817,32 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	vals[2] = base[idx[2]];
 	vals[3] = base[idx[3]];
 	return vfloat4(vals);
+#else
+	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_SVE == 0
+	alignas(16) float vals[4];
+	vals[0] = base[indices[0]];
+	vals[1] = base[indices[1]];
+	vals[2] = base[indices[2]];
+	vals[3] = base[indices[3]];
+	return vfloat4(vals);
+#else
+	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
+}
 /**
  * @brief Store a vector to an unaligned memory address.
  */
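Not part of the diff: a small sketch of how the new byte-index gather is intended to be called. `lut` and `idxs` are hypothetical names; the explicit template argument selects the 4-wide specialisation shown above, and on SVE-capable builds the `#else` branch performs a hardware gather instead of four scalar loads.

```cpp
alignas(16) float lut[64];                    // e.g. a quantization table, filled elsewhere
const uint8_t idxs[4] = { 3, 17, 42, 63 };    // byte indices stored in memory

// Loads lut[3], lut[17], lut[42], lut[63] into one vector.
vfloat4 v = gatherf_byte_inds<vfloat4>(lut, idxs);
```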
@@ -950,87 +954,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
-}
-
-/**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	int8x16_t table {
-		vreinterpretq_s8_s32(t0.m)
-	};
-
-	// Set index byte above max index for unused bytes so table lookup returns zero
-	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
-	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
-
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
-}
-
-/**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	int8x16x2_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m)
-	};
-
-	// Set index byte above max index for unused bytes so table lookup returns zero
-	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
-	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
-
-	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
-}
-
-/**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	int8x16x4_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m),
-		vreinterpretq_s8_s32(t2.m),
-		vreinterpretq_s8_s32(t3.m)
-	};
-
-	// Set index byte above max index for unused bytes so table lookup returns zero
-	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
-	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
-
-	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
-}
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
+
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.t0 = vld1q_u8(data);
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.t01 = uint8x16x2_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16)
+	};
+}
+
+/**
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.t0123 = uint8x16x4_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16),
+		vld1q_u8(data + 32),
+		vld1q_u8(data + 48)
+	};
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
+	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
+
+	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
+}
 
 /**
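A usage sketch (not from the patch) of the reworked vtable API: tables are now built once from a flat byte array via `vtable_prepare()` and queried with `vtable_lookup_32bit()`, instead of passing one `vint4` per 16-byte segment to `vtable_8bt_32bi()`. The names `lut`, `table`, `idx` and `val` below are hypothetical.

```cpp
alignas(16) uint8_t lut[64];   // 64-entry 8-bit table, filled elsewhere

vtable4_64x8 table;
vtable_prepare(table, lut);    // load (and, on SSE, pre-XOR) the segments once

vint4 idx(0, 15, 31, 63);      // 32-bit indices, one per lane
vint4 val = vtable_lookup_32bit(table, idx);   // val lane i == lut[idx lane i]
```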
229  thirdparty/astcenc/astcenc_vecmathlib_none_4.h (vendored)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */

@@ -233,7 +225,7 @@ struct vint4
 	/**
 	 * @brief Construct from 4 scalar values replicated across all lanes.
 	 *
-	 * Consider using vint4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{

@@ -354,7 +346,7 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
 		return m[l] != 0;
 	}

@@ -420,10 +412,26 @@ ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
  */
 ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 {
-	return ((a.m[0] >> 31) & 0x1) |
-	       ((a.m[1] >> 30) & 0x2) |
-	       ((a.m[2] >> 29) & 0x4) |
-	       ((a.m[3] >> 28) & 0x8);
+	return (a.m[0] & 0x1) |
+	       (a.m[1] & 0x2) |
+	       (a.m[2] & 0x4) |
+	       (a.m[3] & 0x8);
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
 }
 
 // ============================================================================

@@ -638,14 +646,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 	return vint4(std::max(b, c));
 }
 
-/**
- * @brief Return the horizontal sum of vector lanes as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
-}
-
 /**
  * @brief Store a vector to an aligned memory address.
  */

@@ -684,29 +684,23 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 	int b0 = a.m[0] & 0xFF;
 	int b1 = a.m[1] & 0xFF;
 	int b2 = a.m[2] & 0xFF;
 	int b3 = a.m[3] & 0xFF;
 
+#if !defined(ASTCENC_BIG_ENDIAN)
 	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
-	return vint4(b, 0, 0, 0);
+#else
+	int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
+#endif
+	a = vint4(b, 0, 0, 0);
+	store_nbytes(a, p);
 }
 
 /**

@@ -934,17 +928,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
-	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
-	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
-	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */

@@ -956,6 +939,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	               base[indices.m[3]]);
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	return vfloat4(base[indices[0]],
+	               base[indices[1]],
+	               base[indices[2]],
+	               base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */

@@ -1080,84 +1075,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
 	return r;
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
-}
-
-/**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	uint8_t table[16];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-}
-
-/**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	uint8_t table[32];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-}
-
-/**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	uint8_t table[64];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-}
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	const uint8_t* data;
+};
+
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
+}
+
+/**
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
+}
 
 /**

@@ -1170,7 +1175,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3
  */
 ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
 {
+#if !defined(ASTCENC_BIG_ENDIAN)
 	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+#else
+	return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
+#endif
 }
 
 /**
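A quick scalar check (not in the patch) of why the new big-endian path reverses the shift order: the packed 32-bit word must produce the same R, G, B, A byte sequence once it is stored to memory, regardless of host byte order. The function and values below are illustrative only.

```cpp
#include <cstdint>

// Pack one RGBA8 texel the way interleave_rgba8() does for a single lane.
uint32_t pack_rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a, bool big_endian)
{
	// Little-endian: r goes in the lowest byte, which is stored first.
	// Big-endian: the highest byte is stored first, so r must go there instead.
	return big_endian
	    ? (uint32_t(a)) | (uint32_t(b) << 8) | (uint32_t(g) << 16) | (uint32_t(r) << 24)
	    : (uint32_t(r)) | (uint32_t(g) << 8) | (uint32_t(b) << 16) | (uint32_t(a) << 24);
}
```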
325  thirdparty/astcenc/astcenc_vecmathlib_sse_4.h (vendored)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */

@@ -229,7 +221,7 @@ struct vint4
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{

@@ -436,6 +428,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================

@@ -598,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  */
 ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
 {
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /*

@@ -608,25 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  */
 ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 {
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
-}
-
-/**
- * @brief Return the horizontal sum of a vector as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	// Add top and bottom halves, lane 1/0
-	__m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m),
-	                                              _mm_castsi128_ps(a.m)));
-	__m128i t = _mm_add_epi32(a.m, fold);
-
-	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
-	t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55));
-
-	return _mm_cvtsi128_si32(t);
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /**
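An aside on the hmin()/hmax() change (not from the patch): the old code folded twice and then needed a final broadcast shuffle; the new code is a butterfly reduction, where each swap-and-min step already leaves the running minimum in every lane, so no broadcast is required. A scalar model of the new pattern:

```cpp
#include <algorithm>
#include <array>

// Scalar model of the SSE butterfly: after two swap-and-min steps every
// element holds the overall minimum, mirroring _MM_SHUFFLE(2, 3, 0, 1)
// (swap within pairs) and _MM_SHUFFLE(1, 0, 3, 2) (swap the pairs).
std::array<int, 4> hmin_model(std::array<int, 4> a)
{
	std::array<int, 4> s1 { a[1], a[0], a[3], a[2] };
	for (int i = 0; i < 4; i++) a[i] = std::min(a[i], s1[i]);

	std::array<int, 4> s2 { a[2], a[3], a[0], a[1] };
	for (int i = 0; i < 4; i++) a[i] = std::min(a[i], s2[i]);

	return a;   // all four lanes now equal the minimum
}
```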
@@ -663,32 +655,20 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-#if ASTCENC_AVX >= 2
-	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
-#else
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
-#endif
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 #if ASTCENC_SSE >= 41
 	__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
-	return vint4(_mm_shuffle_epi8(a.m, shuf));
+	a = vint4(_mm_shuffle_epi8(a.m, shuf));
+	store_nbytes(a, p);
 #else
 	__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
 	__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
-	return vint4(_mm_unpacklo_epi16(va, vb));
+	a = vint4(_mm_unpacklo_epi16(va, vb));
+	store_nbytes(a, p);
 #endif
 }
 

@@ -899,25 +879,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 #endif
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-#if ASTCENC_SSE >= 41
-	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
-#else
-	__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
-	return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
-#endif
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
-#if ASTCENC_AVX >= 2
+#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
 	return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
 #else
 	alignas(16) int idx[4];

@@ -926,6 +893,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 #endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	// Experimentally, in this particular use case (byte indices in memory),
+	// using 4 separate scalar loads is appreciably faster than using gathers
+	// even if they're available, on every x86 uArch tried, so always do the
+	// separate loads even when ASTCENC_X86_GATHERS is enabled.
+	//
+	// Tested on:
+	// - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
+	// - AMD Zen 2, Zen 4
+	return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */

@@ -1054,136 +1038,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(_mm_castsi128_ps(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-#if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-#else
-	t0p = t0;
-	t1p = t1;
-#endif
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-#if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
-#else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
-#endif
-}
-
-/**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-#if ASTCENC_SSE >= 41
-	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
-	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
-
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	return vint4(result);
-#else
-	uint8_t table[16];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-#endif
-}
-
-/**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-#if ASTCENC_SSE >= 41
-	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
-	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
-
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
-	result = _mm_xor_si128(result, result2);
-
-	return vint4(result);
-#else
-	uint8_t table[32];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-#endif
-}
-
-/**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-#if ASTCENC_SSE >= 41
-	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
-	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
-
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
-	result = _mm_xor_si128(result, result2);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
-	result = _mm_xor_si128(result, result2);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
-	result = _mm_xor_si128(result, result2);
-
-	return vint4(result);
-#else
-	uint8_t table[64];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-#endif
-}
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+	vint4 t2;
+	vint4 t3;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+#if ASTCENC_SSE >= 41
+	table.t0 = vint4::load(data);
+#else
+	table.data = data;
+#endif
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+#if ASTCENC_SSE >= 41
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+
+	table.t1 = table.t1 ^ table.t0;
+#else
+	table.data = data;
+#endif
+}
+
+/**
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+#if ASTCENC_SSE >= 41
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
+#else
+	table.data = data;
+#endif
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
+#if ASTCENC_SSE >= 41
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
+#if ASTCENC_SSE >= 41
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
+#if ASTCENC_SSE >= 41
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
+#endif
+}
 
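An explanatory aside (not part of the patch) on why the SSE `vtable_prepare()` stores the segments pre-XORed (`t1 ^= t0`, and so on): `_mm_shuffle_epi8` only honours the low 4 index bits plus the sign bit, so every 16-byte segment is shuffled with the same wrapped index and the partial results are XOR-combined; pre-XORing the segments makes the unwanted contributions cancel. A scalar model of the 32-entry case, with hypothetical names:

```cpp
#include <cstdint>

// Scalar model of the 32-entry SSE lookup: seg0/seg1 are the two 16-byte
// segments, stored by vtable_prepare() as t0 = seg0 and t1 = seg0 ^ seg1.
uint8_t lookup32_model(const uint8_t seg0[16], const uint8_t seg1[16], uint8_t idx)
{
	uint8_t r0 = seg0[idx & 15];   // pshufb ignores index bits 4-6

	// Second shuffle uses idx - 16; pshufb returns 0 when the index byte's
	// MSB is set, which is what happens for idx < 16 after the subtraction.
	uint8_t r1 = (idx >= 16) ? uint8_t(seg0[idx & 15] ^ seg1[idx & 15]) : uint8_t(0);

	// idx < 16:  r0 ^ 0                         == seg0[idx]
	// idx >= 16: seg0[i] ^ (seg0[i] ^ seg1[i])  == seg1[i], with i = idx - 16
	return uint8_t(r0 ^ r1);
}
```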
@@ -1307,7 +1328,11 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  */
 ASTCENC_SIMD_INLINE int popcount(uint64_t v)
 {
+#if !defined(__x86_64__) && !defined(_M_AMD64)
+	return static_cast<int>(__builtin_popcountll(v));
+#else
 	return static_cast<int>(_mm_popcnt_u64(v));
+#endif
 }
 
 #endif // ASTCENC_POPCNT >= 1
1101  thirdparty/astcenc/astcenc_vecmathlib_sve_8.h (vendored, new file)
File diff suppressed because it is too large.
57  thirdparty/astcenc/astcenc_weight_align.cpp (vendored)
@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 

@@ -104,14 +105,17 @@ static void compute_angular_offsets(
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		// Add 2^23 and interpreting bits extracts round-to-nearest int
-		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
-		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
+		// Convert a weight to a sincos table index
+		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
+		vint isample = float_to_int_rtn(sample);
 		storea(isample, isamplev + i);
 	}
 
 	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
-	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+	vfloat mult(1.0f / (2.0f * astc::PI));
 
 	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
 	{
@@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
 
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
+
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;

@@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
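A closing note (not part of the patch) on the compute_lowest_and_highest_weight() change: the inner per-step loop no longer tracks minidx/maxidx incrementally. The weight range is computed once up front, and because value -> round(value * rcp_stepsize - offset) is monotonic for a positive step, the extreme rounded indices follow directly from the extreme weights. A scalar sketch of that precomputation, with hypothetical names:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Scalar sketch: derive the rounded index extremes for one angular step from
// the weight extremes; valid because the mapping below is monotonic.
// Assumes weights is non-empty and rcp_stepsize > 0.
void index_range(const std::vector<float>& weights, float rcp_stepsize, float offset,
                 float& minidx, float& maxidx)
{
	auto [lo, hi] = std::minmax_element(weights.begin(), weights.end());
	minidx = std::round(*lo * rcp_stepsize - offset);
	maxidx = std::round(*hi * rcp_stepsize - offset);
}
```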