Rémi Verschelde 2023-08-07 15:34:07 +02:00
parent 7d2ca2d8ac
commit 82f20cdcc0
No known key found for this signature in database
GPG key ID: C3336907360768E1
22 changed files with 1036 additions and 713 deletions

View file

@ -165,7 +165,7 @@ License: BSD-3-clause
Files: ./thirdparty/astcenc/ Files: ./thirdparty/astcenc/
Comment: Arm ASTC Encoder Comment: Arm ASTC Encoder
Copyright: 2011-2023, Arm Limited Copyright: 2011-2024, Arm Limited
License: Apache-2.0 License: Apache-2.0
Files: ./thirdparty/basis_universal/ Files: ./thirdparty/basis_universal/

View file

@ -47,7 +47,7 @@ Files extracted from upstream source:
## astcenc ## astcenc
- Upstream: https://github.com/ARM-software/astc-encoder - Upstream: https://github.com/ARM-software/astc-encoder
- Version: 4.4.0 (5a5b5a1ef60dd47c27c28c66c118d22c40e3197e, 2023) - Version: 4.7.0 (1a51f2915121275038677317c8bf61f1a78b590c, 2024)
- License: Apache 2.0 - License: Apache 2.0
Files extracted from upstream source: Files extracted from upstream source:

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2020-2023 Arm Limited // Copyright 2020-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -215,6 +215,8 @@ enum astcenc_error {
ASTCENC_ERR_BAD_CONTEXT, ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */ /** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED, ASTCENC_ERR_NOT_IMPLEMENTED,
/** @brief The call failed due to an out-of-spec decode mode flag set. */
ASTCENC_ERR_BAD_DECODE_MODE,
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */ /** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE, ASTCENC_ERR_DTRACE_FAILURE,
@ -302,6 +304,11 @@ enum astcenc_type
ASTCENC_TYPE_F32 = 2 ASTCENC_TYPE_F32 = 2
}; };
/**
 * @brief Function pointer type for compression progress reporting callback.
 *
 * The callback takes a single @c float argument and returns nothing. As described for the
 * @c progress_callback field of the config structure, the codec reports compression progress
 * through it as a percentage between 0 and 100. The type is declared with C language linkage.
 */
extern "C" typedef void (*astcenc_progress_callback)(float);
/** /**
* @brief Enable normal map compression. * @brief Enable normal map compression.
* *
@ -312,6 +319,19 @@ enum astcenc_type
*/ */
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0; static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
 * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
 *
 * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
 * flag during compression will allow the compressor to use the correct rounding when selecting
 * encodings. This will improve the compressed image quality if your application is using the
 * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
 *
 * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
 * this setting.
 *
 * This flag occupies bit 1 of the flags bitfield and is one of the bits in ASTCENC_ALL_FLAGS.
 */
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/** /**
* @brief Enable alpha weighting. * @brief Enable alpha weighting.
* *
@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_RGBM | ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT | ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL | ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_USE_DECODE_UNORM8 |
ASTCENC_FLG_DECOMPRESS_ONLY | ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY; ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
@ -542,6 +563,24 @@ struct astcenc_config
*/ */
float tune_2plane_early_out_limit_correlation; float tune_2plane_early_out_limit_correlation;
/**
* @brief The config enable for the mode0 fast-path search.
*
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
* search is enabled. This option is ineffective for 3D block sizes.
*/
float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
/** /**
* @brief The path to save the diagnostic trace data to. * @brief The path to save the diagnostic trace data to.

View file

@ -40,6 +40,27 @@
#include "astcenc_internal.h" #include "astcenc_internal.h"
/**
 * @brief Compute the squared error of an LDR RGB or RGBA endpoint encoding.
 *
 * Each original endpoint is compared against its unpacked quantized counterpart. The per-lane
 * differences are squared, accumulated across both endpoints, and then reduced to a scalar with
 * @c hadd_s. The result is a sum of squared differences; it is not divided by a lane count.
 *
 * @param uquant0   The original endpoint 0 color.
 * @param uquant1   The original endpoint 1 color.
 * @param quant0    The unpacked quantized endpoint 0 color.
 * @param quant1    The unpacked quantized endpoint 1 color.
 *
 * @return The accumulated squared error of the encoding.
 */
static float get_rgba_encoding_error(
	vfloat4 uquant0,
	vfloat4 uquant1,
	vint4 quant0,
	vint4 quant1
) {
	// Signed per-lane distance between each original endpoint and its quantized form
	vfloat4 delta0 = uquant0 - int_to_float(quant0);
	vfloat4 delta1 = uquant1 - int_to_float(quant1);

	// Square and accumulate per lane, then collapse the lanes into one scalar
	vfloat4 squared = delta0 * delta0 + delta1 * delta1;
	return hadd_s(squared);
}
/** /**
* @brief Determine the quantized value given a quantization level. * @brief Determine the quantized value given a quantization level.
* *
@ -56,6 +77,26 @@ static inline uint8_t quant_color(
return color_unquant_to_uquant_tables[quant_level - QUANT_6][index]; return color_unquant_to_uquant_tables[quant_level - QUANT_6][index];
} }
/**
 * @brief Determine the unpacked quantized values for the first three lanes of a color.
 *
 * Lanes 0, 1, and 2 are each looked up independently in the unquant-to-uquant table selected by
 * the quantization level, using table index <tt>value * 2 + 1</tt>. Lane 3 of the input is
 * ignored and lane 3 of the result is always zero.
 *
 * @param quant_level   The quantization level to use.
 * @param value         The values to convert. These must be in the 0-255 range.
 *
 * @return The unpacked quantized values, returned in 0-255 range, with lane 3 set to zero.
 */
static inline vint4 quant_color3(
	quant_method quant_level,
	vint4 value
) {
	// All three lanes share the same table row, so select it once
	const auto& row = color_unquant_to_uquant_tables[quant_level - QUANT_6];

	vint4 index = value * 2 + 1;
	return vint4(
		row[index.lane<0>()],
		row[index.lane<1>()],
		row[index.lane<2>()],
		0);
}
/** /**
* @brief Determine the quantized value given a quantization level and residual. * @brief Determine the quantized value given a quantization level and residual.
* *
@ -83,6 +124,35 @@ static inline uint8_t quant_color(
return color_unquant_to_uquant_tables[quant_level - QUANT_6][index]; return color_unquant_to_uquant_tables[quant_level - QUANT_6][index];
} }
/**
 * @brief Determine the unpacked quantized values for the first three lanes of a color, using the
 *        pre-rounding value to choose between the two table entries for each lane.
 *
 * For each lane the table index is <tt>value * 2</tt>, incremented by one when the residual
 * <tt>valuef - value</tt> is at least -0.1. Lane 3 of the inputs is not looked up and lane 3 of
 * the result is always zero.
 *
 * @param quant_level   The quantization level to use.
 * @param value         The values to convert. These must be in the 0-255 range.
 * @param valuef        The original values before rounding, used to compute a residual.
 *
 * @return The unpacked quantized values, returned in 0-255 range, with lane 3 set to zero.
 */
static inline vint4 quant_color3(
	quant_method quant_level,
	vint4 value,
	vfloat4 valuef
) {
	// All three lanes share the same table row, so select it once
	const auto& row = color_unquant_to_uquant_tables[quant_level - QUANT_6];

	// The residual decides whether ties round down or up. The test should nominally be
	// residual >= 0, but empirical testing shows that a small bias helps.
	vfloat4 residual = valuef - int_to_float(value);
	vmask4 round_up = residual >= vfloat4(-0.1f);

	vint4 index_down = value * 2;
	vint4 index = select(index_down, index_down + 1, round_up);

	return vint4(
		row[index.lane<0>()],
		row[index.lane<1>()],
		row[index.lane<2>()],
		0);
}
/** /**
* @brief Quantize an LDR RGB color. * @brief Quantize an LDR RGB color.
* *
@ -92,47 +162,33 @@ static inline uint8_t quant_color(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). * @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
*/ */
static void quantize_rgb( static void quantize_rgb(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[6], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; vint4 color0i, color1i;
vfloat4 nudge(0.2f);
float r0 = astc::clamp255f(color0.lane<0>() * scale);
float g0 = astc::clamp255f(color0.lane<1>() * scale);
float b0 = astc::clamp255f(color0.lane<2>() * scale);
float r1 = astc::clamp255f(color1.lane<0>() * scale);
float g1 = astc::clamp255f(color1.lane<1>() * scale);
float b1 = astc::clamp255f(color1.lane<2>() * scale);
int ri0, gi0, bi0, ri1, gi1, bi1;
float rgb0_addon = 0.0f;
float rgb1_addon = 0.0f;
do do
{ {
ri0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(r0 + rgb0_addon), 0), r0 + rgb0_addon); vint4 color0q = max(float_to_int_rtn(color0), vint4(0));
gi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(g0 + rgb0_addon), 0), g0 + rgb0_addon); color0i = quant_color3(quant_level, color0q, color0);
bi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(b0 + rgb0_addon), 0), b0 + rgb0_addon); color0 = color0 - nudge;
ri1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(r1 + rgb1_addon), 255), r1 + rgb1_addon);
gi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(g1 + rgb1_addon), 255), g1 + rgb1_addon);
bi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(b1 + rgb1_addon), 255), b1 + rgb1_addon);
rgb0_addon -= 0.2f; vint4 color1q = min(float_to_int_rtn(color1), vint4(255));
rgb1_addon += 0.2f; color1i = quant_color3(quant_level, color1q, color1);
} while (ri0 + gi0 + bi0 > ri1 + gi1 + bi1); color1 = color1 + nudge;
} while (hadd_rgb_s(color0i) > hadd_rgb_s(color1i));
output[0] = static_cast<uint8_t>(ri0); color0_out = color0i;
output[1] = static_cast<uint8_t>(ri1); color1_out = color1i;
output[2] = static_cast<uint8_t>(gi0);
output[3] = static_cast<uint8_t>(gi1);
output[4] = static_cast<uint8_t>(bi0);
output[5] = static_cast<uint8_t>(bi1);
} }
/** /**
@ -145,24 +201,24 @@ static void quantize_rgb(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1). * @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
*/ */
static void quantize_rgba( static void quantize_rgba(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[8], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; quantize_rgb(color0, color1, color0_out, color1_out, quant_level);
float a0 = astc::clamp255f(color0.lane<3>() * scale); float a0 = color0.lane<3>();
float a1 = astc::clamp255f(color1.lane<3>() * scale); float a1 = color1.lane<3>();
output[6] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0));
output[7] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1));
quantize_rgb(color0, color1, output, quant_level);
} }
/** /**
@ -172,7 +228,8 @@ static void quantize_rgba(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0). * @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -180,54 +237,35 @@ static void quantize_rgba(
static bool try_quantize_rgb_blue_contract( static bool try_quantize_rgb_blue_contract(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[6], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; // Apply inverse blue-contraction
color0 += color0 - color0.swz<2, 2, 2, 3>();
color1 += color1 - color1.swz<2, 2, 2, 3>();
float r0 = color0.lane<0>() * scale; // If anything overflows BC cannot be used
float g0 = color0.lane<1>() * scale; vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f));
float b0 = color0.lane<2>() * scale; vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f));
if (any(color0_error | color1_error))
float r1 = color1.lane<0>() * scale;
float g1 = color1.lane<1>() * scale;
float b1 = color1.lane<2>() * scale;
// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
r0 += (r0 - b0);
g0 += (g0 - b0);
r1 += (r1 - b1);
g1 += (g1 - b1);
if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
{ {
return false; return false;
} }
// Quantize the inverse-blue-contracted color // Quantize the inverse blue-contracted color
int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0), r0); vint4 color0i = quant_color3(quant_level, float_to_int_rtn(color0), color0);
int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0), g0); vint4 color1i = quant_color3(quant_level, float_to_int_rtn(color1), color1);
int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0), b0);
int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1), r1); // If color #1 is not larger than color #0 then blue-contraction cannot be used
int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1), g1); // We must test afterwards because quantization can change the order
int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1), b1); if (hadd_rgb_s(color1i) <= hadd_rgb_s(color0i))
// If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that
// blue-contraction and quantization change this order, which is why we must test afterwards.
if (ri1 + gi1 + bi1 <= ri0 + gi0 + bi0)
{ {
return false; return false;
} }
output[0] = static_cast<uint8_t>(ri1); color0_out = color1i;
output[1] = static_cast<uint8_t>(ri0); color1_out = color0i;
output[2] = static_cast<uint8_t>(gi1);
output[3] = static_cast<uint8_t>(gi0);
output[4] = static_cast<uint8_t>(bi1);
output[5] = static_cast<uint8_t>(bi0);
return true; return true;
} }
@ -238,7 +276,8 @@ static bool try_quantize_rgb_blue_contract(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0, a1, a0). * @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -246,18 +285,22 @@ static bool try_quantize_rgb_blue_contract(
static bool try_quantize_rgba_blue_contract( static bool try_quantize_rgba_blue_contract(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[8], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; if (try_quantize_rgb_blue_contract(color0, color1, color0_out, color1_out, quant_level))
{
float a0 = color0.lane<3>();
float a1 = color1.lane<3>();
float a0 = astc::clamp255f(color0.lane<3>() * scale); color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1));
float a1 = astc::clamp255f(color1.lane<3>() * scale); color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0));
output[6] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); return true;
output[7] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); }
return try_quantize_rgb_blue_contract(color0, color1, output, quant_level); return false;
} }
/** /**
@ -269,7 +312,8 @@ static bool try_quantize_rgba_blue_contract(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). * @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -277,85 +321,54 @@ static bool try_quantize_rgba_blue_contract(
static bool try_quantize_rgb_delta( static bool try_quantize_rgb_delta(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[6], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; // Transform color0 to unorm9
vint4 color0a = float_to_int_rtn(color0);
float r0 = astc::clamp255f(color0.lane<0>() * scale); color0.set_lane<3>(0.0f);
float g0 = astc::clamp255f(color0.lane<1>() * scale); color0a = lsl<1>(color0a);
float b0 = astc::clamp255f(color0.lane<2>() * scale);
float r1 = astc::clamp255f(color1.lane<0>() * scale);
float g1 = astc::clamp255f(color1.lane<1>() * scale);
float b1 = astc::clamp255f(color1.lane<2>() * scale);
// Transform r0 to unorm9
int r0a = astc::flt2int_rtn(r0);
int g0a = astc::flt2int_rtn(g0);
int b0a = astc::flt2int_rtn(b0);
r0a <<= 1;
g0a <<= 1;
b0a <<= 1;
// Mask off the top bit // Mask off the top bit
int r0b = r0a & 0xFF; vint4 color0b = color0a & 0xFF;
int g0b = g0a & 0xFF;
int b0b = b0a & 0xFF;
// Quantize then unquantize in order to get a value that we take differences against // Quantize then unquantize in order to get a value that we take differences against
int r0be = quant_color(quant_level, r0b); vint4 color0be = quant_color3(quant_level, color0b);
int g0be = quant_color(quant_level, g0b); color0b = color0be | (color0a & 0x100);
int b0be = quant_color(quant_level, b0b);
r0b = r0be | (r0a & 0x100);
g0b = g0be | (g0a & 0x100);
b0b = b0be | (b0a & 0x100);
// Get hold of the second value // Get hold of the second value
int r1d = astc::flt2int_rtn(r1); vint4 color1d = float_to_int_rtn(color1);
int g1d = astc::flt2int_rtn(g1); color1d = lsl<1>(color1d);
int b1d = astc::flt2int_rtn(b1);
r1d <<= 1;
g1d <<= 1;
b1d <<= 1;
// ... and take differences // ... and take differences
r1d -= r0b; color1d = color1d - color0b;
g1d -= g0b; color1d.set_lane<3>(0);
b1d -= b0b;
// Check if the difference is too large to be encodable // Check if the difference is too large to be encodable
if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64) if (any((color1d > vint4(63)) | (color1d < vint4(-64))))
{ {
return false; return false;
} }
// Insert top bit of the base into the offset // Insert top bit of the base into the offset
r1d &= 0x7F; color1d = color1d & 0x7F;
g1d &= 0x7F; color1d = color1d | lsr<1>(color0b & 0x100);
b1d &= 0x7F;
r1d |= (r0b & 0x100) >> 1;
g1d |= (g0b & 0x100) >> 1;
b1d |= (b0b & 0x100) >> 1;
// Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails // Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails
// since we have then corrupted either the top bit of the base or the sign bit of the offset // since we have then corrupted either the top bit of the base or the sign bit of the offset
int r1de = quant_color(quant_level, r1d); vint4 color1de = quant_color3(quant_level, color1d);
int g1de = quant_color(quant_level, g1d);
int b1de = quant_color(quant_level, b1d);
if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0) vint4 color_flips = (color1d ^ color1de) & 0xC0;
color_flips.set_lane<3>(0);
if (any(color_flips != vint4::zero()))
{ {
return false; return false;
} }
// If the sum of offsets triggers blue-contraction then encoding fails // If the sum of offsets triggers blue-contraction then encoding fails
vint4 ep0(r0be, g0be, b0be, 0); vint4 ep0 = color0be;
vint4 ep1(r1de, g1de, b1de, 0); vint4 ep1 = color1de;
bit_transfer_signed(ep1, ep0); bit_transfer_signed(ep1, ep0);
if (hadd_rgb_s(ep1) < 0) if (hadd_rgb_s(ep1) < 0)
{ {
@ -369,111 +382,90 @@ static bool try_quantize_rgb_delta(
return false; return false;
} }
output[0] = static_cast<uint8_t>(r0be); color0_out = color0be;
output[1] = static_cast<uint8_t>(r1de); color1_out = color1de;
output[2] = static_cast<uint8_t>(g0be);
output[3] = static_cast<uint8_t>(g1de);
output[4] = static_cast<uint8_t>(b0be);
output[5] = static_cast<uint8_t>(b1de);
return true; return true;
} }
/**
* @brief Try to quantize an LDR RGB color using delta encoding and blue-contraction.
*
* Blue-contraction is only usable if encoded color 1 RGB is larger than color 0 RGB.
*
* @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint.
* @param[out] color0_out The output quantized color0 endpoint.
* @param[out] color1_out The output quantized color1 endpoint.
* @param quant_level The quantization level to use.
*
* @return Returns @c false on failure, @c true on success.
*/
static bool try_quantize_rgb_delta_blue_contract( static bool try_quantize_rgb_delta_blue_contract(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[6], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
// Note: Switch around endpoint colors already at start // Note: Switch around endpoint colors already at start
float scale = 1.0f / 257.0f; std::swap(color0, color1);
float r1 = color0.lane<0>() * scale; // Apply inverse blue-contraction
float g1 = color0.lane<1>() * scale; color0 += color0 - color0.swz<2, 2, 2, 3>();
float b1 = color0.lane<2>() * scale; color1 += color1 - color1.swz<2, 2, 2, 3>();
float r0 = color1.lane<0>() * scale; // If anything overflows BC cannot be used
float g0 = color1.lane<1>() * scale; vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f));
float b0 = color1.lane<2>() * scale; vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f));
if (any(color0_error | color1_error))
// Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used.
r0 += (r0 - b0);
g0 += (g0 - b0);
r1 += (r1 - b1);
g1 += (g1 - b1);
if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
{ {
return false; return false;
} }
// Transform r0 to unorm9 // Transform color0 to unorm9
int r0a = astc::flt2int_rtn(r0); vint4 color0a = float_to_int_rtn(color0);
int g0a = astc::flt2int_rtn(g0); color0.set_lane<3>(0.0f);
int b0a = astc::flt2int_rtn(b0); color0a = lsl<1>(color0a);
r0a <<= 1;
g0a <<= 1;
b0a <<= 1;
// Mask off the top bit // Mask off the top bit
int r0b = r0a & 0xFF; vint4 color0b = color0a & 0xFF;
int g0b = g0a & 0xFF;
int b0b = b0a & 0xFF;
// Quantize, then unquantize in order to get a value that we take differences against. // Quantize then unquantize in order to get a value that we take differences against
int r0be = quant_color(quant_level, r0b); vint4 color0be = quant_color3(quant_level, color0b);
int g0be = quant_color(quant_level, g0b); color0b = color0be | (color0a & 0x100);
int b0be = quant_color(quant_level, b0b);
r0b = r0be | (r0a & 0x100);
g0b = g0be | (g0a & 0x100);
b0b = b0be | (b0a & 0x100);
// Get hold of the second value // Get hold of the second value
int r1d = astc::flt2int_rtn(r1); vint4 color1d = float_to_int_rtn(color1);
int g1d = astc::flt2int_rtn(g1); color1d = lsl<1>(color1d);
int b1d = astc::flt2int_rtn(b1);
r1d <<= 1; // ... and take differences
g1d <<= 1; color1d = color1d - color0b;
b1d <<= 1; color1d.set_lane<3>(0);
// .. and take differences!
r1d -= r0b;
g1d -= g0b;
b1d -= b0b;
// Check if the difference is too large to be encodable // Check if the difference is too large to be encodable
if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64) if (any((color1d > vint4(63)) | (color1d < vint4(-64))))
{ {
return false; return false;
} }
// Insert top bit of the base into the offset // Insert top bit of the base into the offset
r1d &= 0x7F; color1d = color1d & 0x7F;
g1d &= 0x7F; color1d = color1d | lsr<1>(color0b & 0x100);
b1d &= 0x7F;
r1d |= (r0b & 0x100) >> 1; // Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails
g1d |= (g0b & 0x100) >> 1; // since we have then corrupted either the top bit of the base or the sign bit of the offset
b1d |= (b0b & 0x100) >> 1; vint4 color1de = quant_color3(quant_level, color1d);
// Then quantize and unquantize; if this causes any of the top two bits to flip, vint4 color_flips = (color1d ^ color1de) & 0xC0;
// then encoding fails, since we have then corrupted either the top bit of the base color_flips.set_lane<3>(0);
// or the sign bit of the offset. if (any(color_flips != vint4::zero()))
int r1de = quant_color(quant_level, r1d);
int g1de = quant_color(quant_level, g1d);
int b1de = quant_color(quant_level, b1d);
if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
{ {
return false; return false;
} }
// If the sum of offsets does not trigger blue-contraction then encoding fails // If the sum of offsets does not trigger blue-contraction then encoding fails
vint4 ep0(r0be, g0be, b0be, 0); vint4 ep0 = color0be;
vint4 ep1(r1de, g1de, b1de, 0); vint4 ep1 = color1de;
bit_transfer_signed(ep1, ep0); bit_transfer_signed(ep1, ep0);
if (hadd_rgb_s(ep1) >= 0) if (hadd_rgb_s(ep1) >= 0)
{ {
@ -487,13 +479,8 @@ static bool try_quantize_rgb_delta_blue_contract(
return false; return false;
} }
output[0] = static_cast<uint8_t>(r0be); color0_out = color0be;
output[1] = static_cast<uint8_t>(r1de); color1_out = color1de;
output[2] = static_cast<uint8_t>(g0be);
output[3] = static_cast<uint8_t>(g1de);
output[4] = static_cast<uint8_t>(b0be);
output[5] = static_cast<uint8_t>(b1de);
return true; return true;
} }
@ -508,7 +495,8 @@ static bool try_quantize_rgb_delta_blue_contract(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (x, x, x, x, x, x, a0, a1). * @param[out] color0_out The output quantized color0 endpoint; must preserve lane 0/1/2.
* @param[out] color1_out The output quantized color1 endpoint; must preserve lane 0/1/2.
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -516,13 +504,12 @@ static bool try_quantize_rgb_delta_blue_contract(
static bool try_quantize_alpha_delta( static bool try_quantize_alpha_delta(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[8], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; float a0 = color0.lane<3>();
float a1 = color1.lane<3>();
float a0 = astc::clamp255f(color0.lane<3>() * scale);
float a1 = astc::clamp255f(color1.lane<3>() * scale);
int a0a = astc::flt2int_rtn(a0); int a0a = astc::flt2int_rtn(a0);
a0a <<= 1; a0a <<= 1;
@ -561,8 +548,8 @@ static bool try_quantize_alpha_delta(
return false; return false;
} }
output[6] = static_cast<uint8_t>(a0be); color0_out.set_lane<3>(a0be);
output[7] = static_cast<uint8_t>(a1de); color1_out.set_lane<3>(a1de);
return true; return true;
} }
@ -589,13 +576,11 @@ static bool try_quantize_luminance_alpha_delta(
uint8_t output[4], uint8_t output[4],
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; float l0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
float l1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
float l0 = astc::clamp255f(hadd_rgb_s(color0) * ((1.0f / 3.0f) * scale)); float a0 = color0.lane<3>();
float l1 = astc::clamp255f(hadd_rgb_s(color1) * ((1.0f / 3.0f) * scale)); float a1 = color1.lane<3>();
float a0 = astc::clamp255f(color0.lane<3>() * scale);
float a1 = astc::clamp255f(color1.lane<3>() * scale);
int l0a = astc::flt2int_rtn(l0); int l0a = astc::flt2int_rtn(l0);
int a0a = astc::flt2int_rtn(a0); int a0a = astc::flt2int_rtn(a0);
@ -693,7 +678,8 @@ static bool try_quantize_luminance_alpha_delta(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1). * @param[out] color0_out The output quantized color0 endpoint
* @param[out] color1_out The output quantized color1 endpoint
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -701,14 +687,14 @@ static bool try_quantize_luminance_alpha_delta(
static bool try_quantize_rgba_delta( static bool try_quantize_rgba_delta(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[8], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
return try_quantize_rgb_delta(color0, color1, output, quant_level) && return try_quantize_rgb_delta(color0, color1, color0_out, color1_out, quant_level) &&
try_quantize_alpha_delta(color0, color1, output, quant_level); try_quantize_alpha_delta(color0, color1, color0_out, color1_out, quant_level);
} }
/** /**
* @brief Try to quantize an LDR RGBA color using delta and blue contract encoding. * @brief Try to quantize an LDR RGBA color using delta and blue contract encoding.
* *
@ -720,7 +706,8 @@ static bool try_quantize_rgba_delta(
* *
* @param color0 The input unquantized color0 endpoint. * @param color0 The input unquantized color0 endpoint.
* @param color1 The input unquantized color1 endpoint. * @param color1 The input unquantized color1 endpoint.
* @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1). * @param[out] color0_out The output quantized color0 endpoint
* @param[out] color1_out The output quantized color1 endpoint
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
* *
* @return Returns @c false on failure, @c true on success. * @return Returns @c false on failure, @c true on success.
@ -728,12 +715,13 @@ static bool try_quantize_rgba_delta(
static bool try_quantize_rgba_delta_blue_contract( static bool try_quantize_rgba_delta_blue_contract(
vfloat4 color0, vfloat4 color0,
vfloat4 color1, vfloat4 color1,
uint8_t output[8], vint4& color0_out,
vint4& color1_out,
quant_method quant_level quant_method quant_level
) { ) {
// Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract // Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract
return try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level) && return try_quantize_rgb_delta_blue_contract(color0, color1, color0_out, color1_out, quant_level) &&
try_quantize_alpha_delta(color1, color0, output, quant_level); try_quantize_alpha_delta(color1, color0, color0_out, color1_out, quant_level);
} }
/** /**
@ -774,6 +762,8 @@ static void quantize_rgbs(
/** /**
* @brief Quantize an LDR RGBA color using scale encoding. * @brief Quantize an LDR RGBA color using scale encoding.
* *
* @param color0 The input unquantized color0 alpha endpoint.
* @param color1 The input unquantized color1 alpha endpoint.
* @param color The input unquantized color endpoint and scale factor. * @param color The input unquantized color endpoint and scale factor.
* @param[out] output The output endpoints, returned as (r0, g0, b0, s, a0, a1). * @param[out] output The output endpoints, returned as (r0, g0, b0, s, a0, a1).
* @param quant_level The quantization level to use. * @param quant_level The quantization level to use.
@ -785,10 +775,8 @@ static void quantize_rgbs_alpha(
uint8_t output[6], uint8_t output[6],
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; float a0 = color0.lane<3>();
float a1 = color1.lane<3>();
float a0 = astc::clamp255f(color0.lane<3>() * scale);
float a1 = astc::clamp255f(color1.lane<3>() * scale);
output[4] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); output[4] = quant_color(quant_level, astc::flt2int_rtn(a0), a0);
output[5] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); output[5] = quant_color(quant_level, astc::flt2int_rtn(a1), a1);
@ -810,13 +798,8 @@ static void quantize_luminance(
uint8_t output[2], uint8_t output[2],
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
color0 = color0 * scale;
color1 = color1 * scale;
float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
if (lum0 > lum1) if (lum0 > lum1)
{ {
@ -843,16 +826,11 @@ static void quantize_luminance_alpha(
uint8_t output[4], uint8_t output[4],
quant_method quant_level quant_method quant_level
) { ) {
float scale = 1.0f / 257.0f; float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
color0 = color0 * scale; float a0 = color0.lane<3>();
color1 = color1 * scale; float a1 = color1.lane<3>();
float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
float a0 = astc::clamp255f(color0.lane<3>());
float a1 = astc::clamp255f(color1.lane<3>());
output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0), lum0); output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0), lum0);
output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1), lum1); output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1), lum1);
@ -1939,58 +1917,170 @@ uint8_t pack_color_endpoints(
) { ) {
assert(QUANT_6 <= quant_level && quant_level <= QUANT_256); assert(QUANT_6 <= quant_level && quant_level <= QUANT_256);
// We do not support negative colors // Clamp colors to a valid LDR range
color0 = max(color0, 0.0f); // Note that HDR has a lower max, handled in the conversion functions
color1 = max(color1, 0.0f); color0 = clamp(0.0f, 65535.0f, color0);
color1 = clamp(0.0f, 65535.0f, color1);
// Pre-scale the LDR value we need to the 0-255 quantizable range
vfloat4 color0_ldr = color0 * (1.0f / 257.0f);
vfloat4 color1_ldr = color1 * (1.0f / 257.0f);
uint8_t retval = 0; uint8_t retval = 0;
float best_error = ERROR_CALC_DEFAULT;
vint4 color0_out, color1_out;
vint4 color0_out2, color1_out2;
switch (format) switch (format)
{ {
case FMT_RGB: case FMT_RGB:
if (quant_level <= QUANT_160) if (quant_level <= QUANT_160)
{ {
if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level)) if (try_quantize_rgb_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level))
{ {
vint4 color0_unpack;
vint4 color1_unpack;
rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack);
retval = FMT_RGB_DELTA; retval = FMT_RGB_DELTA;
break; best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
} }
if (try_quantize_rgb_delta(color0, color1, output, quant_level))
if (try_quantize_rgb_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
{ {
retval = FMT_RGB_DELTA; vint4 color0_unpack;
break; vint4 color1_unpack;
rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGB_DELTA;
best_error = error;
color0_out = color0_out2;
color1_out = color1_out2;
}
} }
} }
if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level))
if (quant_level < QUANT_256)
{ {
retval = FMT_RGB; if (try_quantize_rgb_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
break; {
vint4 color0_unpack;
vint4 color1_unpack;
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGB;
best_error = error;
color0_out = color0_out2;
color1_out = color1_out2;
}
}
} }
quantize_rgb(color0, color1, output, quant_level);
retval = FMT_RGB; {
quantize_rgb(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level);
vint4 color0_unpack;
vint4 color1_unpack;
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGB;
color0_out = color0_out2;
color1_out = color1_out2;
}
}
// TODO: Can we vectorize this?
output[0] = static_cast<uint8_t>(color0_out.lane<0>());
output[1] = static_cast<uint8_t>(color1_out.lane<0>());
output[2] = static_cast<uint8_t>(color0_out.lane<1>());
output[3] = static_cast<uint8_t>(color1_out.lane<1>());
output[4] = static_cast<uint8_t>(color0_out.lane<2>());
output[5] = static_cast<uint8_t>(color1_out.lane<2>());
break; break;
case FMT_RGBA: case FMT_RGBA:
if (quant_level <= QUANT_160) if (quant_level <= QUANT_160)
{ {
if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level)) if (try_quantize_rgba_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level))
{ {
vint4 color0_unpack;
vint4 color1_unpack;
rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack);
retval = FMT_RGBA_DELTA; retval = FMT_RGBA_DELTA;
break; best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
} }
if (try_quantize_rgba_delta(color0, color1, output, quant_level))
if (try_quantize_rgba_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
{ {
retval = FMT_RGBA_DELTA; vint4 color0_unpack;
break; vint4 color1_unpack;
rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGBA_DELTA;
best_error = error;
color0_out = color0_out2;
color1_out = color1_out2;
}
} }
} }
if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level))
if (quant_level < QUANT_256)
{ {
retval = FMT_RGBA; if (try_quantize_rgba_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level))
break; {
vint4 color0_unpack;
vint4 color1_unpack;
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGBA;
best_error = error;
color0_out = color0_out2;
color1_out = color1_out2;
}
}
} }
quantize_rgba(color0, color1, output, quant_level);
retval = FMT_RGBA; {
quantize_rgba(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level);
vint4 color0_unpack;
vint4 color1_unpack;
rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack);
float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack);
if (error < best_error)
{
retval = FMT_RGBA;
color0_out = color0_out2;
color1_out = color1_out2;
}
}
// TODO: Can we vectorize this?
output[0] = static_cast<uint8_t>(color0_out.lane<0>());
output[1] = static_cast<uint8_t>(color1_out.lane<0>());
output[2] = static_cast<uint8_t>(color0_out.lane<1>());
output[3] = static_cast<uint8_t>(color1_out.lane<1>());
output[4] = static_cast<uint8_t>(color0_out.lane<2>());
output[5] = static_cast<uint8_t>(color1_out.lane<2>());
output[6] = static_cast<uint8_t>(color0_out.lane<3>());
output[7] = static_cast<uint8_t>(color1_out.lane<3>());
break; break;
case FMT_RGB_SCALE: case FMT_RGB_SCALE:
@ -2009,7 +2099,7 @@ uint8_t pack_color_endpoints(
break; break;
case FMT_RGB_SCALE_ALPHA: case FMT_RGB_SCALE_ALPHA:
quantize_rgbs_alpha(color0, color1, rgbs_color, output, quant_level); quantize_rgbs_alpha(color0_ldr, color1_ldr, rgbs_color, output, quant_level);
retval = FMT_RGB_SCALE_ALPHA; retval = FMT_RGB_SCALE_ALPHA;
break; break;
@ -2025,20 +2115,20 @@ uint8_t pack_color_endpoints(
break; break;
case FMT_LUMINANCE: case FMT_LUMINANCE:
quantize_luminance(color0, color1, output, quant_level); quantize_luminance(color0_ldr, color1_ldr, output, quant_level);
retval = FMT_LUMINANCE; retval = FMT_LUMINANCE;
break; break;
case FMT_LUMINANCE_ALPHA: case FMT_LUMINANCE_ALPHA:
if (quant_level <= 18) if (quant_level <= 18)
{ {
if (try_quantize_luminance_alpha_delta(color0, color1, output, quant_level)) if (try_quantize_luminance_alpha_delta(color0_ldr, color1_ldr, output, quant_level))
{ {
retval = FMT_LUMINANCE_ALPHA_DELTA; retval = FMT_LUMINANCE_ALPHA_DELTA;
break; break;
} }
} }
quantize_luminance_alpha(color0, color1, output, quant_level); quantize_luminance_alpha(color0_ldr, color1_ldr, output, quant_level);
retval = FMT_LUMINANCE_ALPHA; retval = FMT_LUMINANCE_ALPHA;
break; break;

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited // Copyright 2011-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -40,15 +40,7 @@ static ASTCENC_SIMD_INLINE vint4 uncontract_color(
return select(input, bc0, mask); return select(input, bc0, mask);
} }
/** void rgba_delta_unpack(
* @brief Unpack an LDR RGBA color that uses delta encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgba_delta_unpack(
vint4 input0, vint4 input0,
vint4 input1, vint4 input1,
vint4& output0, vint4& output0,
@ -92,15 +84,7 @@ static void rgb_delta_unpack(
output1.set_lane<3>(255); output1.set_lane<3>(255);
} }
/** void rgba_unpack(
* @brief Unpack an LDR RGBA color that uses direct encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgba_unpack(
vint4 input0, vint4 input0,
vint4 input1, vint4 input1,
vint4& output0, vint4& output0,
@ -910,32 +894,55 @@ void unpack_color_endpoints(
} }
} }
vint4 ldr_scale(257); // Handle endpoint errors and expansion
vint4 hdr_scale(1);
vint4 output_scale = ldr_scale;
// An LDR profile image // Linear LDR 8-bit endpoints are expanded to 16-bit by replication
if ((decode_mode == ASTCENC_PRF_LDR) || if (decode_mode == ASTCENC_PRF_LDR)
(decode_mode == ASTCENC_PRF_LDR_SRGB))
{ {
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB // Error color - HDR endpoint in an LDR encoding
if (rgb_hdr == true) if (rgb_hdr || alpha_hdr)
{ {
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output_scale = hdr_scale;
rgb_hdr = false; rgb_hdr = false;
alpha_hdr = false; alpha_hdr = false;
} }
output0 = output0 * 257;
output1 = output1 * 257;
} }
// An HDR profile image // sRGB LDR 8-bit endpoints are expanded to 16 bit by:
// - RGB = shift left by 8 bits and OR with 0x80
// - A = replication
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
vmask4 mask(true, true, true, false);
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
vint4 output0a = output0 * 257;
output0 = select(output0a, output0rgb, mask);
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
vint4 output1a = output1 * 257;
output1 = select(output1a, output1rgb, mask);
}
// An HDR profile decode, but may be using linear LDR endpoints
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
// HDR endpoints are already 16-bit
else else
{ {
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr); vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
output_scale = select(ldr_scale, hdr_scale, hdr_lanes); vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
output0 = output0 * output_scale;
output1 = output1 * output_scale;
} }
output0 = output0 * output_scale;
output1 = output1 * output_scale;
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -247,7 +247,7 @@ static bool realign_weights_decimated(
} }
// Create an unquantized weight grid for this decimation level // Create an unquantized weight grid for this decimation level
alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
{ {
vint unquant_value(dec_weights_uquant + we_idx); vint unquant_value(dec_weights_uquant + we_idx);
@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(
qwt_bitcounts[i] = static_cast<int8_t>(bitcount); qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the weight mode // Generate the optimized set of weights for the weight mode
compute_quantized_weights_for_decimation( compute_quantized_weights_for_decimation(
@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
unsigned int decimation_mode = bm.decimation_mode; unsigned int decimation_mode = bm.decimation_mode;
const auto& di = bsd.get_decimation_info(decimation_mode); const auto& di = bsd.get_decimation_info(decimation_mode);
alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the mode // Generate the optimized set of weights for the mode
compute_quantized_weights_for_decimation( compute_quantized_weights_for_decimation(
@ -1163,7 +1163,7 @@ static float prepare_block_statistics(
void compress_block( void compress_block(
const astcenc_contexti& ctx, const astcenc_contexti& ctx,
const image_block& blk, const image_block& blk,
physical_compressed_block& pcb, uint8_t pcb[16],
compression_working_buffers& tmpbuf) compression_working_buffers& tmpbuf)
{ {
astcenc_profile decode_mode = ctx.config.profile; astcenc_profile decode_mode = ctx.config.profile;
@ -1282,9 +1282,10 @@ void compress_block(
static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot; static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
// Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels // Only enable MODE0 fast path if enabled
// Never enable for 3D blocks as no "always" block modes are available
int start_trial = 1; int start_trial = 1;
if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1)) if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
{ {
start_trial = 0; start_trial = 0;
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -27,15 +27,15 @@
/** /**
* @brief Compute the integer linear interpolation of two color endpoints. * @brief Compute the integer linear interpolation of two color endpoints.
* *
* @param decode_mode The ASTC profile (linear or sRGB) * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
* @param color0 The endpoint0 color. * @param color0 The endpoint0 color.
* @param color1 The endpoint1 color. * @param color1 The endpoint1 color.
* @param weights The interpolation weight (between 0 and 64). * @param weights The interpolation weight (between 0 and 64).
* *
* @return The interpolated color. * @return The interpolated color.
*/ */
static vint4 lerp_color_int( static vint4 lerp_color_int(
astcenc_profile decode_mode, vmask4 u8_mask,
vint4 color0, vint4 color0,
vint4 color1, vint4 color1,
vint4 weights vint4 weights
@ -43,24 +43,18 @@ static vint4 lerp_color_int(
vint4 weight1 = weights; vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1; vint4 weight0 = vint4(64) - weight1;
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color0 = asr<8>(color0);
color1 = asr<8>(color1);
}
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color); color = asr<6>(color);
if (decode_mode == ASTCENC_PRF_LDR_SRGB) // For decode_unorm8 values force the codec to bit replicate. This allows the
{ // rest of the codec to assume the full 0xFFFF range for everything and ignore
color = color * vint4(257); // the decode_mode setting
} vint4 color_u8 = asr<8>(color) * vint4(257);
color = select(color, color_u8, u8_mask);
return color; return color;
} }
/** /**
* @brief Convert integer color value into a float value for the decoder. * @brief Convert integer color value into a float value for the decoder.
* *
@ -104,10 +98,10 @@ void unpack_weights(
if (!is_dual_plane) if (!is_dual_plane)
{ {
// Build full 64-entry weight lookup table // Build full 64-entry weight lookup table
vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0)); vint4 tab0 = vint4::load(scb.weights + 0);
vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16)); vint4 tab1 = vint4::load(scb.weights + 16);
vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32)); vint4 tab2 = vint4::load(scb.weights + 32);
vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48)); vint4 tab3 = vint4::load(scb.weights + 48);
vint tab0p, tab1p, tab2p, tab3p; vint tab0p, tab1p, tab2p, tab3p;
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
@ -134,14 +128,14 @@ void unpack_weights(
{ {
// Build a 32-entry weight lookup table per plane // Build a 32-entry weight lookup table per plane
// Plane 1 // Plane 1
vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0)); vint4 tab0_plane1 = vint4::load(scb.weights + 0);
vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16)); vint4 tab1_plane1 = vint4::load(scb.weights + 16);
vint tab0_plane1p, tab1_plane1p; vint tab0_plane1p, tab1_plane1p;
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
// Plane 2 // Plane 2
vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32)); vint4 tab0_plane2 = vint4::load(scb.weights + 32);
vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48)); vint4 tab1_plane2 = vint4::load(scb.weights + 48);
vint tab0_plane2p, tab1_plane2p; vint tab0_plane2p, tab1_plane2p;
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
@ -229,12 +223,13 @@ void decompress_symbolic_block(
{ {
vint4 colori(scb.constant_color); vint4 colori(scb.constant_color);
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion. // Determine the UNORM8 rounding on the decode
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{ // The real decoder would just use the top 8 bits, but we rescale
colori = asr<8>(colori) * 257; // in to a 16-bit value that rounds correctly.
} vint4 colori_u8 = asr<8>(colori) * 257;
colori = select(colori, colori_u8, u8_mask);
vint4 colorf16 = unorm16_to_sf16(colori); vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16); color = float16_to_float(colorf16);
@ -289,6 +284,8 @@ void decompress_symbolic_block(
int plane2_component = scb.plane2_component; int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
for (int i = 0; i < partition_count; i++) for (int i = 0; i < partition_count; i++)
{ {
// Decode the color endpoints for this partition // Decode the color endpoints for this partition
@ -310,7 +307,7 @@ void decompress_symbolic_block(
{ {
int tix = pi.texels_of_partition[i][j]; int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask); vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>(); blk.data_r[tix] = colorf.lane<0>();
@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
rgb_lns, a_lns, rgb_lns, a_lns,
ep0, ep1); ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition // Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count; unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++) for (unsigned int i = 0; i < texel_count; i++)
{ {
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori); vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i); vfloat4 oldColor = blk.texel(i);
@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
int plane1_weights[BLOCK_MAX_TEXELS]; int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
vfloat4 summa = vfloat4::zero(); vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++) for (unsigned int i = 0; i < partition_count; i++)
{ {
@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
for (unsigned int j = 0; j < texel_count; j++) for (unsigned int j = 0; j < texel_count; j++)
{ {
unsigned int tix = pi.texels_of_partition[i][j]; unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(config.profile, ep0, ep1, vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
vint4(plane1_weights[tix])); vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori); vfloat4 color = int_to_float(colori);
@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights // Unquantize and undecimate the weights
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition // Decode the color endpoints for this partition
@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
rgb_lns, a_lns, rgb_lns, a_lns,
ep0, ep1); ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Pre-shift sRGB so things round correctly
if (config.profile == ASTCENC_PRF_LDR_SRGB)
{
ep0 = asr<8>(ep0);
ep1 = asr<8>(ep1);
}
// Unpack and compute error for each texel in the partition // Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero(); vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id(); vint lane_id = vint::lane_id();
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
unsigned int texel_count = bsd.texel_count; unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
vint ep0_b = vint(ep0.lane<2>()) * weight0; vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0; vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Shift so things round correctly // Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode bit replicate top 8 bits
// so rest of codec can assume 0xFFFF max range everywhere
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff // Compute color diff
vfloat color_r = int_to_float(colori_r); vfloat color_r = int_to_float(colori_r);

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -55,6 +55,7 @@ struct astcenc_preset_config
float tune_2partition_early_out_limit_factor; float tune_2partition_early_out_limit_factor;
float tune_3partition_early_out_limit_factor; float tune_3partition_early_out_limit_factor;
float tune_2plane_early_out_limit_correlation; float tune_2plane_early_out_limit_correlation;
float tune_search_mode0_enable;
}; };
/** /**
@ -63,22 +64,22 @@ struct astcenc_preset_config
static const std::array<astcenc_preset_config, 6> preset_configs_high {{ static const std::array<astcenc_preset_config, 6> preset_configs_high {{
{ {
ASTCENC_PRE_FASTEST, ASTCENC_PRE_FASTEST,
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
}, { }, {
ASTCENC_PRE_FAST, ASTCENC_PRE_FAST,
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
}, { }, {
ASTCENC_PRE_MEDIUM, ASTCENC_PRE_MEDIUM,
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
}, { }, {
ASTCENC_PRE_THOROUGH, ASTCENC_PRE_THOROUGH,
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
}, { }, {
ASTCENC_PRE_VERYTHOROUGH, ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f 4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
}, { }, {
ASTCENC_PRE_EXHAUSTIVE, ASTCENC_PRE_EXHAUSTIVE,
4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f 4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
} }
}}; }};
@ -88,22 +89,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_high {{
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{ static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
{ {
ASTCENC_PRE_FASTEST, ASTCENC_PRE_FASTEST,
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
}, { }, {
ASTCENC_PRE_FAST, ASTCENC_PRE_FAST,
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
}, { }, {
ASTCENC_PRE_MEDIUM, ASTCENC_PRE_MEDIUM,
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
}, { }, {
ASTCENC_PRE_THOROUGH, ASTCENC_PRE_THOROUGH,
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
}, { }, {
ASTCENC_PRE_VERYTHOROUGH, ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f 4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
}, { }, {
ASTCENC_PRE_EXHAUSTIVE, ASTCENC_PRE_EXHAUSTIVE,
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
} }
}}; }};
@ -113,22 +114,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
static const std::array<astcenc_preset_config, 6> preset_configs_low {{ static const std::array<astcenc_preset_config, 6> preset_configs_low {{
{ {
ASTCENC_PRE_FASTEST, ASTCENC_PRE_FASTEST,
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
}, { }, {
ASTCENC_PRE_FAST, ASTCENC_PRE_FAST,
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
}, { }, {
ASTCENC_PRE_MEDIUM, ASTCENC_PRE_MEDIUM,
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
}, { }, {
ASTCENC_PRE_THOROUGH, ASTCENC_PRE_THOROUGH,
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
}, { }, {
ASTCENC_PRE_VERYTHOROUGH, ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f 4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
}, { }, {
ASTCENC_PRE_EXHAUSTIVE, ASTCENC_PRE_EXHAUSTIVE,
4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
} }
}}; }};
@ -216,11 +217,13 @@ static astcenc_error validate_block_size(
/** /**
* @brief Validate flags. * @brief Validate flags.
* *
* @param flags The flags to check. * @param profile The profile to check.
* @param flags The flags to check.
* *
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
*/ */
static astcenc_error validate_flags( static astcenc_error validate_flags(
astcenc_profile profile,
unsigned int flags unsigned int flags
) { ) {
// Flags field must not contain any unknown flag bits // Flags field must not contain any unknown flag bits
@ -238,6 +241,14 @@ static astcenc_error validate_flags(
return ASTCENC_ERR_BAD_FLAGS; return ASTCENC_ERR_BAD_FLAGS;
} }
// Decode_unorm8 must only be used with an LDR profile
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
if (is_unorm8 && is_hdr)
{
return ASTCENC_ERR_BAD_DECODE_MODE;
}
return ASTCENC_SUCCESS; return ASTCENC_SUCCESS;
} }
@ -363,7 +374,7 @@ static astcenc_error validate_config(
return status; return status;
} }
status = validate_flags(config.flags); status = validate_flags(config.profile, config.flags);
if (status != ASTCENC_SUCCESS) if (status != ASTCENC_SUCCESS)
{ {
return status; return status;
@ -504,10 +515,10 @@ astcenc_error astcenc_config_init(
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
@ -516,6 +527,7 @@ astcenc_error astcenc_config_init(
config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor; config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor; config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation; config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
} }
// Start and end node are not the same - so interpolate between them // Start and end node are not the same - so interpolate between them
else else
@ -542,14 +554,10 @@ astcenc_error astcenc_config_init(
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
config.tune_block_mode_limit = LERPI(tune_block_mode_limit); config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
config.tune_refinement_limit = LERPI(tune_refinement_limit); config.tune_refinement_limit = LERPI(tune_refinement_limit);
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), config.tune_candidate_limit = LERPUI(tune_candidate_limit);
TUNE_MAX_TRIAL_CANDIDATES); config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
BLOCK_MAX_PARTITIONINGS); config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
BLOCK_MAX_PARTITIONINGS);
config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
BLOCK_MAX_PARTITIONINGS);
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
LERP(tune_db_limit_b_base) - 19 * ltexels); LERP(tune_db_limit_b_base) - 19 * ltexels);
@ -558,6 +566,7 @@ astcenc_error astcenc_config_init(
config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor); config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor); config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation); config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
#undef LERP #undef LERP
#undef LERPI #undef LERPI
#undef LERPUI #undef LERPUI
@ -585,13 +594,14 @@ astcenc_error astcenc_config_init(
case ASTCENC_PRF_HDR_RGB_LDR_A: case ASTCENC_PRF_HDR_RGB_LDR_A:
case ASTCENC_PRF_HDR: case ASTCENC_PRF_HDR:
config.tune_db_limit = 999.0f; config.tune_db_limit = 999.0f;
config.tune_search_mode0_enable = 0.0f;
break; break;
default: default:
return ASTCENC_ERR_BAD_PROFILE; return ASTCENC_ERR_BAD_PROFILE;
} }
// Flags field must not contain any unknown flag bits // Flags field must not contain any unknown flag bits
status = validate_flags(flags); status = validate_flags(profile, flags);
if (status != ASTCENC_SUCCESS) if (status != ASTCENC_SUCCESS)
{ {
return status; return status;
@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
} }
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN); ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
if (!ctx->bsd)
{
delete ctxo;
return ASTCENC_ERR_OUT_OF_MEM;
}
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
init_block_size_descriptor(config.block_x, config.block_y, config.block_z, init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
can_omit_modes, can_omit_modes,
@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(
#if !defined(ASTCENC_DECOMPRESS_ONLY) #if !defined(ASTCENC_DECOMPRESS_ONLY)
// Do setup only needed by compression // Do setup only needed by compression
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
{ {
// Turn a dB limit into a per-texel error for faster use later // Turn a dB limit into a per-texel error for faster use later
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(
size_t worksize = sizeof(compression_working_buffers) * thread_count; size_t worksize = sizeof(compression_working_buffers) * thread_count;
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN); ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
"compression_working_buffers size must be multiple of vector alignment"); "compression_working_buffers size must be multiple of vector alignment");
if (!ctx->working_buffers) if (!ctx->working_buffers)
{ {
@ -802,6 +818,8 @@ static void compress_image(
int row_blocks = xblocks; int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks; int plane_blocks = xblocks * yblocks;
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
// Populate the block channel weights // Populate the block channel weights
blk.channel_weight = vfloat4(ctx.config.cw_r_weight, blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
ctx.config.cw_g_weight, ctx.config.cw_g_weight,
@ -812,7 +830,7 @@ static void compress_image(
auto& temp_buffers = ctx.working_buffers[thread_index]; auto& temp_buffers = ctx.working_buffers[thread_index];
// Only the first thread actually runs the initializer // Only the first thread actually runs the initializer
ctxo.manage_compress.init(block_count); ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
// Determine if we can use an optimized load function // Determine if we can use an optimized load function
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
@ -914,8 +932,7 @@ static void compress_image(
int offset = ((z * yblocks + y) * xblocks + x) * 16; int offset = ((z * yblocks + y) * xblocks + x) * 16;
uint8_t *bp = buffer + offset; uint8_t *bp = buffer + offset;
physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp); compress_block(ctx, blk, bp, temp_buffers);
compress_block(ctx, blk, *pcb, temp_buffers);
} }
ctxo.manage_compress.complete_task_assignment(count); ctxo.manage_compress.complete_task_assignment(count);
@ -1138,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
unsigned int block_count = zblocks * yblocks * xblocks;
int row_blocks = xblocks; int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks; int plane_blocks = xblocks * yblocks;
@ -1152,6 +1170,9 @@ astcenc_error astcenc_decompress_image(
image_block blk; image_block blk;
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
// Decode mode inferred from the output data type
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
// If context thread count is one then implicitly reset // If context thread count is one then implicitly reset
if (ctx->thread_count == 1) if (ctx->thread_count == 1)
{ {
@ -1159,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
} }
// Only the first thread actually runs the initializer // Only the first thread actually runs the initializer
ctxo->manage_decompress.init(zblocks * yblocks * xblocks); ctxo->manage_decompress.init(block_count, nullptr);
// All threads run this processing loop until there is no work remaining // All threads run this processing loop until there is no work remaining
while (true) while (true)
@ -1182,10 +1203,9 @@ astcenc_error astcenc_decompress_image(
unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
const uint8_t* bp = data + offset; const uint8_t* bp = data + offset;
const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
symbolic_compressed_block scb; symbolic_compressed_block scb;
physical_to_symbolic(*ctx->bsd, pcb, scb); physical_to_symbolic(*ctx->bsd, bp, scb);
decompress_symbolic_block(ctx->config.profile, *ctx->bsd, decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
x * block_x, y * block_y, z * block_z, x * block_x, y * block_y, z * block_z,
@ -1224,9 +1244,8 @@ astcenc_error astcenc_get_block_info(
astcenc_contexti* ctx = &ctxo->context; astcenc_contexti* ctx = &ctxo->context;
// Decode the compressed data into a symbolic form // Decode the compressed data into a symbolic form
const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
symbolic_compressed_block scb; symbolic_compressed_block scb;
physical_to_symbolic(*ctx->bsd, pcb, scb); physical_to_symbolic(*ctx->bsd, data, scb);
// Fetch the appropriate partition and decimation tables // Fetch the appropriate partition and decimation tables
block_size_descriptor& bsd = *ctx->bsd; block_size_descriptor& bsd = *ctx->bsd;
@ -1359,6 +1378,8 @@ const char* astcenc_get_error_string(
return "ASTCENC_ERR_BAD_CONTEXT"; return "ASTCENC_ERR_BAD_CONTEXT";
case ASTCENC_ERR_NOT_IMPLEMENTED: case ASTCENC_ERR_NOT_IMPLEMENTED:
return "ASTCENC_ERR_NOT_IMPLEMENTED"; return "ASTCENC_ERR_NOT_IMPLEMENTED";
case ASTCENC_ERR_BAD_DECODE_MODE:
return "ASTCENC_ERR_BAD_DECODE_MODE";
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
case ASTCENC_ERR_DTRACE_FAILURE: case ASTCENC_ERR_DTRACE_FAILURE:
return "ASTCENC_ERR_DTRACE_FAILURE"; return "ASTCENC_ERR_DTRACE_FAILURE";

View file

@ -250,13 +250,16 @@ static void kmeans_update(
* *
* @return The number of bit mismatches. * @return The number of bit mismatches.
*/ */
static inline unsigned int partition_mismatch2( static inline uint8_t partition_mismatch2(
const uint64_t a[2], const uint64_t a[2],
const uint64_t b[2] const uint64_t b[2]
) { ) {
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
return astc::min(v1, v2);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
} }
/** /**
@ -267,7 +270,7 @@ static inline unsigned int partition_mismatch2(
* *
* @return The number of bit mismatches. * @return The number of bit mismatches.
*/ */
static inline unsigned int partition_mismatch3( static inline uint8_t partition_mismatch3(
const uint64_t a[3], const uint64_t a[3],
const uint64_t b[3] const uint64_t b[3]
) { ) {
@ -295,7 +298,9 @@ static inline unsigned int partition_mismatch3(
int s5 = p11 + p20; int s5 = p11 + p20;
int v2 = astc::min(s4, s5) + p02; int v2 = astc::min(s4, s5) + p02;
return astc::min(v0, v1, v2); // Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
} }
/** /**
@ -306,7 +311,7 @@ static inline unsigned int partition_mismatch3(
* *
* @return The number of bit mismatches. * @return The number of bit mismatches.
*/ */
static inline unsigned int partition_mismatch4( static inline uint8_t partition_mismatch4(
const uint64_t a[4], const uint64_t a[4],
const uint64_t b[4] const uint64_t b[4]
) { ) {
@ -342,7 +347,9 @@ static inline unsigned int partition_mismatch4(
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
return astc::min(v0, v1, v2, v3); // Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
} }
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
@ -359,7 +366,7 @@ static void count_partition_mismatch_bits(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
unsigned int partition_count, unsigned int partition_count,
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS] uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
) { ) {
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
promise(active_count > 0); promise(active_count > 0);
@ -369,6 +376,8 @@ static void count_partition_mismatch_bits(
for (unsigned int i = 0; i < active_count; i++) for (unsigned int i = 0; i < active_count; i++)
{ {
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
} }
} }
else if (partition_count == 3) else if (partition_count == 3)
@ -376,6 +385,8 @@ static void count_partition_mismatch_bits(
for (unsigned int i = 0; i < active_count; i++) for (unsigned int i = 0; i < active_count; i++)
{ {
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
} }
} }
else else
@ -383,6 +394,8 @@ static void count_partition_mismatch_bits(
for (unsigned int i = 0; i < active_count; i++) for (unsigned int i = 0; i < active_count; i++)
{ {
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
} }
} }
} }
@ -397,12 +410,13 @@ static void count_partition_mismatch_bits(
* @return The number of active partitions in this selection. * @return The number of active partitions in this selection.
*/ */
static unsigned int get_partition_ordering_by_mismatch_bits( static unsigned int get_partition_ordering_by_mismatch_bits(
unsigned int texel_count,
unsigned int partitioning_count, unsigned int partitioning_count,
const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS], const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) { ) {
promise(partitioning_count > 0); promise(partitioning_count > 0);
unsigned int mscount[256] { 0 }; uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
// Create the histogram of mismatch counts // Create the histogram of mismatch counts
for (unsigned int i = 0; i < partitioning_count; i++) for (unsigned int i = 0; i < partitioning_count; i++)
@ -410,16 +424,14 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
mscount[mismatch_count[i]]++; mscount[mismatch_count[i]]++;
} }
unsigned int active_count = partitioning_count - mscount[255];
// Create a running sum from the histogram array // Create a running sum from the histogram array
// Cells store previous values only; i.e. exclude self after sum // Cells store previous values only; i.e. exclude self after sum
unsigned int summa = 0; unsigned int sum = 0;
for (unsigned int i = 0; i < 256; i++) for (unsigned int i = 0; i < texel_count; i++)
{ {
unsigned int cnt = mscount[i]; uint16_t cnt = mscount[i];
mscount[i] = summa; mscount[i] = sum;
summa += cnt; sum += cnt;
} }
// Use the running sum as the index, incrementing after read to allow // Use the running sum as the index, incrementing after read to allow
@ -427,10 +439,10 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
for (unsigned int i = 0; i < partitioning_count; i++) for (unsigned int i = 0; i < partitioning_count; i++)
{ {
unsigned int idx = mscount[mismatch_count[i]]++; unsigned int idx = mscount[mismatch_count[i]]++;
partition_ordering[idx] = i; partition_ordering[idx] = static_cast<uint16_t>(i);
} }
return active_count; return partitioning_count;
} }
/** /**
@ -447,7 +459,7 @@ static unsigned int compute_kmeans_partition_ordering(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
const image_block& blk, const image_block& blk,
unsigned int partition_count, unsigned int partition_count,
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) { ) {
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
uint8_t texel_partitions[BLOCK_MAX_TEXELS]; uint8_t texel_partitions[BLOCK_MAX_TEXELS];
@ -478,11 +490,12 @@ static unsigned int compute_kmeans_partition_ordering(
} }
// Count the mismatch between the block and the format's partition tables // Count the mismatch between the block and the format's partition tables
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]; uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
// Sort the partitions based on the number of mismatched bits // Sort the partitions based on the number of mismatched bits
return get_partition_ordering_by_mismatch_bits( return get_partition_ordering_by_mismatch_bits(
texels_to_process,
bsd.partitioning_count_selected[partition_count - 1], bsd.partitioning_count_selected[partition_count - 1],
mismatch_counts, partition_ordering); mismatch_counts, partition_ordering);
} }
@ -565,7 +578,7 @@ unsigned int find_best_partition_candidates(
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
partition_search_limit = astc::min(partition_search_limit, sequence_len); partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates); requested_candidates = astc::min(partition_search_limit, requested_candidates);

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
} }
// Otherwise compute an estimate and perform single refinement iteration // Otherwise compute an estimate and perform single refinement iteration
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
// Compute an initial average for each decimated weight // Compute an initial average for each decimated weight
bool constant_wes = ei.is_constant_weight_error_scale; bool constant_wes = ei.is_constant_weight_error_scale;
@ -1023,7 +1023,7 @@ void compute_quantized_weights_for_decimation(
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
if (get_quant_level(quant_level) <= 16) if (get_quant_level(quant_level) <= 16)
{ {
vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); vint4 tab0 = vint4::load(qat.quant_to_unquant);
vint tab0p; vint tab0p;
vtable_prepare(tab0, tab0p); vtable_prepare(tab0, tab0p);
@ -1056,8 +1056,8 @@ void compute_quantized_weights_for_decimation(
} }
else else
{ {
vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16)); vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
vint tab0p, tab1p; vint tab0p, tab1p;
vtable_prepare(tab0, tab1, tab0p, tab1p); vtable_prepare(tab0, tab1, tab0p, tab1p);
@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
promise(total_texel_count > 0); promise(total_texel_count > 0);
promise(partition_count > 0); promise(partition_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{ {
vint unquant_value(dec_weights_uquant + i); vint unquant_value(dec_weights_uquant + i);
@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
storea(unquant_valuef, dec_weight + i); storea(unquant_valuef, dec_weight + i);
} }
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
float* undec_weight_ref; float* undec_weight_ref;
if (di.max_texel_weight_count == 1) if (di.max_texel_weight_count == 1)
{ {
@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
promise(total_texel_count > 0); promise(total_texel_count > 0);
promise(weight_count > 0); promise(weight_count > 0);
alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
storea(unquant_value2f, dec_weight_plane2 + i); storea(unquant_value2f, dec_weight_plane2 + i);
} }
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
float* undec_weight_plane1_ref; float* undec_weight_plane1_ref;
float* undec_weight_plane2_ref; float* undec_weight_plane2_ref;

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -109,7 +109,7 @@ static vfloat4 swz_texel(
vfloat4 data, vfloat4 data,
const astcenc_swizzle& swz const astcenc_swizzle& swz
) { ) {
alignas(16) float datas[6]; ASTCENC_ALIGNAS float datas[6];
storea(data, datas); storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f; datas[ASTCENC_SWZ_0] = 0.0f;
@ -433,7 +433,7 @@ void store_image_block(
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai); vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
vmask store_mask = vint::lane_id() < vint(used_texels); vmask store_mask = vint::lane_id() < vint(used_texels);
store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask); store_lanes_masked(data8_row, data_rgbai, store_mask);
data8_row += ASTCENC_SIMD_WIDTH * 4; data8_row += ASTCENC_SIMD_WIDTH * 4;
idx += used_texels; idx += used_texels;

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -29,6 +29,7 @@
#include <cstdio> #include <cstdio>
#endif #endif
#include <cstdlib> #include <cstdlib>
#include <limits>
#include "astcenc.h" #include "astcenc.h"
#include "astcenc_mathlib.h" #include "astcenc_mathlib.h"
@ -79,7 +80,7 @@ static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 };
/** @brief The number of partitionings, per partition count, supported by the ASTC format. */ /** @brief The number of partitionings, per partition count, supported by the ASTC format. */
static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 }; static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 };
/** @brief The maximum number of weights used during partition selection for texel clustering. */ /** @brief The maximum number of texels used during partition selection for texel clustering. */
static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 }; static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 };
/** @brief The maximum number of weights a block can support. */ /** @brief The maximum number of weights a block can support. */
@ -119,11 +120,9 @@ static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 };
static constexpr float ERROR_CALC_DEFAULT { 1e30f }; static constexpr float ERROR_CALC_DEFAULT { 1e30f };
/** /**
* @brief The minimum texel count for a block to use the one partition fast path. * @brief The minimum tuning setting threshold for the one partition fast path.
*
* This setting skips 4x4 and 5x4 block sizes.
*/ */
static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 }; static constexpr float TUNE_MIN_SEARCH_MODE0 { 0.85f };
/** /**
* @brief The maximum number of candidate encodings tested for each encoding mode. * @brief The maximum number of candidate encodings tested for each encoding mode.
@ -137,7 +136,7 @@ static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 };
* *
* This can be dynamically reduced by the compression quality preset. * This can be dynamically reduced by the compression quality preset.
*/ */
static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 32 }; static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 8 };
/** /**
* @brief The maximum quant level using full angular endpoint search method. * @brief The maximum quant level using full angular endpoint search method.
@ -386,7 +385,7 @@ struct decimation_info
* @brief The bilinear contribution of the N weights that are interpolated for each texel. * @brief The bilinear contribution of the N weights that are interpolated for each texel.
* Value is between 0 and 1, stored transposed to improve vectorization. * Value is between 0 and 1, stored transposed to improve vectorization.
*/ */
alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
/** @brief The number of texels that each stored weight contributes to. */ /** @brief The number of texels that each stored weight contributes to. */
uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
@ -401,7 +400,7 @@ struct decimation_info
* @brief The bilinear contribution to the N texels that use each weight. * @brief The bilinear contribution to the N texels that use each weight.
* Value is between 0 and 1, stored transposed to improve vectorization. * Value is between 0 and 1, stored transposed to improve vectorization.
*/ */
alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
/** /**
* @brief The bilinear contribution to the Nth texel that uses each weight. * @brief The bilinear contribution to the Nth texel that uses each weight.
@ -581,7 +580,7 @@ struct block_size_descriptor
decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The active decimation tables, stored in low indices. */ /** @brief The active decimation tables, stored in low indices. */
alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */ /** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES]; uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
@ -741,16 +740,16 @@ struct block_size_descriptor
struct image_block struct image_block
{ {
/** @brief The input (compress) or output (decompress) data for the red color component. */ /** @brief The input (compress) or output (decompress) data for the red color component. */
alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS];
/** @brief The input (compress) or output (decompress) data for the green color component. */ /** @brief The input (compress) or output (decompress) data for the green color component. */
alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS];
/** @brief The input (compress) or output (decompress) data for the blue color component. */ /** @brief The input (compress) or output (decompress) data for the blue color component. */
alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS];
/** @brief The input (compress) or output (decompress) data for the alpha color component. */ /** @brief The input (compress) or output (decompress) data for the alpha color component. */
alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS];
/** @brief The number of texels in the block. */ /** @brief The number of texels in the block. */
uint8_t texel_count; uint8_t texel_count;
@ -773,6 +772,9 @@ struct image_block
/** @brief Is this grayscale block where R == G == B for all texels? */ /** @brief Is this grayscale block where R == G == B for all texels? */
bool grayscale; bool grayscale;
/** @brief Is the eventual decode using decode_unorm8 rounding? */
bool decode_unorm8;
/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
uint8_t rgb_lns[BLOCK_MAX_TEXELS]; uint8_t rgb_lns[BLOCK_MAX_TEXELS];
@ -899,10 +901,10 @@ struct endpoints_and_weights
endpoints ep; endpoints ep;
/** @brief The ideal weight for each texel; may be undecimated or decimated. */ /** @brief The ideal weight for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */ /** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS]; ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
}; };
/** /**
@ -932,7 +934,7 @@ struct encoding_choice_errors
/** /**
* @brief Preallocated working buffers, allocated per thread during context creation. * @brief Preallocated working buffers, allocated per thread during context creation.
*/ */
struct alignas(ASTCENC_VECALIGN) compression_working_buffers struct ASTCENC_ALIGNAS compression_working_buffers
{ {
/** @brief Ideal endpoints and weights for plane 1. */ /** @brief Ideal endpoints and weights for plane 1. */
endpoints_and_weights ei1; endpoints_and_weights ei1;
@ -948,7 +950,7 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
* *
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
*/ */
alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
/** /**
* @brief Decimated quantized weight values in the unquantized 0-64 range. * @brief Decimated quantized weight values in the unquantized 0-64 range.
@ -958,7 +960,7 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS]; uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
/** @brief Error of the best encoding combination for each block mode. */ /** @brief Error of the best encoding combination for each block mode. */
alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
/** @brief The best color quant for each block mode. */ /** @brief The best color quant for each block mode. */
uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES]; uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
@ -1025,13 +1027,13 @@ struct dt_init_working_buffers
struct quant_and_transfer_table struct quant_and_transfer_table
{ {
/** @brief The unscrambled unquantized value. */ /** @brief The unscrambled unquantized value. */
int8_t quant_to_unquant[32]; uint8_t quant_to_unquant[32];
/** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */ /** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */
int8_t scramble_map[32]; uint8_t scramble_map[32];
/** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */ /** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */
int8_t unscramble_and_unquant_map[32]; uint8_t unscramble_and_unquant_map[32];
/** /**
* @brief A table of previous-and-next weights, indexed by the current unquantized value. * @brief A table of previous-and-next weights, indexed by the current unquantized value.
@ -1060,7 +1062,7 @@ static constexpr uint8_t SYM_BTYPE_NONCONST { 3 };
* @brief A symbolic representation of a compressed block. * @brief A symbolic representation of a compressed block.
* *
* The symbolic representation stores the unpacked content of a single * The symbolic representation stores the unpacked content of a single
* @c physical_compressed_block, in a form which is much easier to access for * physical compressed block, in a form which is much easier to access for
* the rest of the compressor code. * the rest of the compressor code.
*/ */
struct symbolic_compressed_block struct symbolic_compressed_block
@ -1122,18 +1124,6 @@ struct symbolic_compressed_block
} }
}; };
/**
* @brief A physical representation of a compressed block.
*
* The physical representation stores the raw bytes of the format in memory.
*/
struct physical_compressed_block
{
/** @brief The ASTC encoded data for a single block. */
uint8_t data[16];
};
/** /**
* @brief Parameter structure for @c compute_pixel_region_variance(). * @brief Parameter structure for @c compute_pixel_region_variance().
* *
@ -1577,6 +1567,33 @@ unsigned int find_best_partition_candidates(
Functionality for managing images and image related data. Functionality for managing images and image related data.
============================================================================ */ ============================================================================ */
/**
 * @brief Build the per-component mask of lanes that decompress into a UNORM8 value.
 *
 * The block's @c decode_unorm8 flag takes priority over the color profile: when
 * it is set every lane is flagged. Otherwise, for the @c ASTCENC_PRF_LDR_SRGB
 * profile the first three lanes are flagged and the fourth is left clear. For
 * any other profile no lane is flagged.
 *
 * @param decode_mode   The color profile; only compared against @c ASTCENC_PRF_LDR_SRGB.
 * @param blk           The image block; only its @c decode_unorm8 flag is read.
 *
 * @return The component mask vector.
 */
static inline vmask4 get_u8_component_mask(
	astcenc_profile decode_mode,
	const image_block& blk
) {
	// Decode mode writing to a unorm8 output value: every lane is selected
	if (blk.decode_unorm8)
	{
		return vmask4(true);
	}

	// SRGB writing to a unorm8 RGB value: first three lanes only
	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
	{
		return vmask4(true, true, true, false);
	}

	// Neither condition applies, so no lane is selected
	return vmask4(false);
}
/** /**
* @brief Setup computation of regional averages in an image. * @brief Setup computation of regional averages in an image.
* *
@ -1830,7 +1847,7 @@ uint8_t pack_color_endpoints(
* *
* Endpoints must be unscrambled and converted into the 0-255 range before calling this function. * Endpoints must be unscrambled and converted into the 0-255 range before calling this function.
* *
* @param decode_mode The decode mode (LDR, HDR). * @param decode_mode The decode mode (LDR, HDR, etc).
* @param format The color endpoint mode used. * @param format The color endpoint mode used.
* @param input The raw array of encoded input integers. The length of this array * @param input The raw array of encoded input integers. The length of this array
* depends on @c format; it can be safely assumed to be large enough. * depends on @c format; it can be safely assumed to be large enough.
@ -1848,6 +1865,34 @@ void unpack_color_endpoints(
vint4& output0, vint4& output0,
vint4& output1); vint4& output1);
/**
* @brief Unpack an LDR RGBA color that uses delta encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
void rgba_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1);
/**
* @brief Unpack an LDR RGBA color that uses direct encoding.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
void rgba_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1);
/** /**
* @brief Unpack a set of quantized and decimated weights. * @brief Unpack a set of quantized and decimated weights.
* *
@ -2007,7 +2052,7 @@ void compute_angular_endpoints_2planes(
void compress_block( void compress_block(
const astcenc_contexti& ctx, const astcenc_contexti& ctx,
const image_block& blk, const image_block& blk,
physical_compressed_block& pcb, uint8_t pcb[16],
compression_working_buffers& tmpbuf); compression_working_buffers& tmpbuf);
/** /**
@ -2100,12 +2145,12 @@ float compute_symbolic_block_difference_1plane_1partition(
* *
* @param bsd The block size information. * @param bsd The block size information.
* @param scb The symbolic representation. * @param scb The symbolic representation.
* @param[out] pcb The binary encoded data. * @param[out] pcb The physical compressed block output.
*/ */
void symbolic_to_physical( void symbolic_to_physical(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
const symbolic_compressed_block& scb, const symbolic_compressed_block& scb,
physical_compressed_block& pcb); uint8_t pcb[16]);
/** /**
* @brief Convert a binary physical encoding into a symbolic representation. * @brief Convert a binary physical encoding into a symbolic representation.
@ -2114,12 +2159,12 @@ void symbolic_to_physical(
* flagged as an error block if the encoding is invalid. * flagged as an error block if the encoding is invalid.
* *
* @param bsd The block size information. * @param bsd The block size information.
* @param pcb The binary encoded data. * @param pcb The physical compressed block input.
* @param[out] scb The output symbolic representation. * @param[out] scb The output symbolic representation.
*/ */
void physical_to_symbolic( void physical_to_symbolic(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
const physical_compressed_block& pcb, const uint8_t pcb[16],
symbolic_compressed_block& scb); symbolic_compressed_block& scb);
/* ============================================================================ /* ============================================================================
@ -2128,10 +2173,11 @@ Platform-specific functions.
/** /**
* @brief Allocate an aligned memory buffer. * @brief Allocate an aligned memory buffer.
* *
* Allocated memory must be freed by aligned_free; * Allocated memory must be freed by aligned_free.
* *
* @param size The desired buffer size. * @param size The desired buffer size.
* @param align The desired buffer alignment; must be 2^N. * @param align The desired buffer alignment; must be 2^N, may be increased
* by the implementation to a minimum allowable alignment.
* *
* @return The memory buffer pointer or nullptr on allocation failure. * @return The memory buffer pointer or nullptr on allocation failure.
*/ */
@ -2141,10 +2187,14 @@ T* aligned_malloc(size_t size, size_t align)
void* ptr; void* ptr;
int error = 0; int error = 0;
// Don't allow this to under-align a type
size_t min_align = astc::max(alignof(T), sizeof(void*));
size_t real_align = astc::max(min_align, align);
#if defined(_WIN32) #if defined(_WIN32)
ptr = _aligned_malloc(size, align); ptr = _aligned_malloc(size, real_align);
#else #else
error = posix_memalign(&ptr, align, size); error = posix_memalign(&ptr, real_align, size);
#endif #endif
if (error || (!ptr)) if (error || (!ptr))
@ -2164,9 +2214,9 @@ template<typename T>
void aligned_free(T* ptr) void aligned_free(T* ptr)
{ {
#if defined(_WIN32) #if defined(_WIN32)
_aligned_free(reinterpret_cast<void*>(ptr)); _aligned_free(ptr);
#else #else
free(reinterpret_cast<void*>(ptr)); free(ptr);
#endif #endif
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -118,6 +118,18 @@ private:
/** @brief Number of tasks that need to be processed. */ /** @brief Number of tasks that need to be processed. */
unsigned int m_task_count; unsigned int m_task_count;
/** @brief Progress callback (optional). */
astcenc_progress_callback m_callback;
/** @brief Lock used for callback synchronization. */
std::mutex m_callback_lock;
/** @brief Minimum progress before making a callback. */
float m_callback_min_diff;
/** @brief Last progress callback value. */
float m_callback_last_value;
public: public:
/** @brief Create a new ParallelManager. */ /** @brief Create a new ParallelManager. */
ParallelManager() ParallelManager()
@ -138,6 +150,8 @@ public:
m_start_count = 0; m_start_count = 0;
m_done_count = 0; m_done_count = 0;
m_task_count = 0; m_task_count = 0;
m_callback_last_value = 0.0f;
m_callback_min_diff = 1.0f;
} }
/** /**
@ -166,14 +180,20 @@ public:
* initialization. Other threads will block and wait for it to complete. * initialization. Other threads will block and wait for it to complete.
* *
* @param task_count Total number of tasks needing processing. * @param task_count Total number of tasks needing processing.
* @param callback Function pointer for progress status callbacks.
*/ */
void init(unsigned int task_count) void init(unsigned int task_count, astcenc_progress_callback callback)
{ {
std::lock_guard<std::mutex> lck(m_lock); std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done) if (!m_init_done)
{ {
m_callback = callback;
m_task_count = task_count; m_task_count = task_count;
m_init_done = true; m_init_done = true;
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
m_callback_min_diff = astc::max(min_diff, 1.0f);
} }
} }
@ -212,12 +232,49 @@ public:
{ {
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the // Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads // update here and the wait() for other threads
std::unique_lock<std::mutex> lck(m_lock); unsigned int local_count;
this->m_done_count += count; float local_last_value;
if (m_done_count == m_task_count)
{ {
lck.unlock(); std::unique_lock<std::mutex> lck(m_lock);
m_complete.notify_all(); m_done_count += count;
local_count = m_done_count;
local_last_value = m_callback_last_value;
if (m_done_count == m_task_count)
{
// Ensure the progress bar hits 100%
if (m_callback)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
m_callback(100.0f);
m_callback_last_value = 100.0f;
}
lck.unlock();
m_complete.notify_all();
}
}
// Process progress callback if we have one
if (m_callback)
{
// Initial lockless test - have we progressed enough to emit?
float num = static_cast<float>(local_count);
float den = static_cast<float>(m_task_count);
float this_value = (num / den) * 100.0f;
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
// Recheck under lock, because another thread might report first
if (report_test)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
if (report_retest)
{
m_callback(this_value);
m_callback_last_value = this_value;
}
}
} }
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -73,10 +73,22 @@
#endif #endif
#endif #endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX #if ASTCENC_AVX
#define ASTCENC_VECALIGN 32 #define ASTCENC_VECALIGN 32
#else #elif ASTCENC_SSE || ASTCENC_NEON
#define ASTCENC_VECALIGN 16 #define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif #endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0

View file

@ -102,7 +102,7 @@ static inline void write_bits(
void symbolic_to_physical( void symbolic_to_physical(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
const symbolic_compressed_block& scb, const symbolic_compressed_block& scb,
physical_compressed_block& pcb uint8_t pcb[16]
) { ) {
assert(scb.block_type != SYM_BTYPE_ERROR); assert(scb.block_type != SYM_BTYPE_ERROR);
@ -113,13 +113,13 @@ void symbolic_to_physical(
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++) for (unsigned int i = 0; i < 8; i++)
{ {
pcb.data[i] = cbytes[i]; pcb[i] = cbytes[i];
} }
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{ {
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
} }
return; return;
@ -132,13 +132,13 @@ void symbolic_to_physical(
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++) for (unsigned int i = 0; i < 8; i++)
{ {
pcb.data[i] = cbytes[i]; pcb[i] = cbytes[i];
} }
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{ {
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
} }
return; return;
@ -194,23 +194,23 @@ void symbolic_to_physical(
for (int i = 0; i < 16; i++) for (int i = 0; i < 16; i++)
{ {
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i])); pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
} }
write_bits(scb.block_mode, 11, 0, pcb.data); write_bits(scb.block_mode, 11, 0, pcb);
write_bits(partition_count - 1, 2, 11, pcb.data); write_bits(partition_count - 1, 2, 11, pcb);
int below_weights_pos = 128 - bits_for_weights; int below_weights_pos = 128 - bits_for_weights;
// Encode partition index and color endpoint types for blocks with 2+ partitions // Encode partition index and color endpoint types for blocks with 2+ partitions
if (partition_count > 1) if (partition_count > 1)
{ {
write_bits(scb.partition_index, 6, 13, pcb.data); write_bits(scb.partition_index, 6, 13, pcb);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data); write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
if (scb.color_formats_matched) if (scb.color_formats_matched)
{ {
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data); write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
} }
else else
{ {
@ -249,20 +249,20 @@ void symbolic_to_physical(
int encoded_type_highpart = encoded_type >> 6; int encoded_type_highpart = encoded_type >> 6;
int encoded_type_highpart_size = (3 * partition_count) - 4; int encoded_type_highpart_size = (3 * partition_count) - 4;
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size; int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data); write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data); write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
below_weights_pos -= encoded_type_highpart_size; below_weights_pos -= encoded_type_highpart_size;
} }
} }
else else
{ {
write_bits(scb.color_formats[0], 4, 13, pcb.data); write_bits(scb.color_formats[0], 4, 13, pcb);
} }
// In dual-plane mode, encode the color component of the second plane of weights // In dual-plane mode, encode the color component of the second plane of weights
if (is_dual_plane) if (is_dual_plane)
{ {
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data); write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
} }
// Encode the color components // Encode the color components
@ -281,7 +281,7 @@ void symbolic_to_physical(
valuecount_to_encode += vals; valuecount_to_encode += vals;
} }
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data, encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS); scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
} }
@ -290,7 +290,7 @@ void symbolic_to_physical(
/* See header for documentation. */ /* See header for documentation. */
void physical_to_symbolic( void physical_to_symbolic(
const block_size_descriptor& bsd, const block_size_descriptor& bsd,
const physical_compressed_block& pcb, const uint8_t pcb[16],
symbolic_compressed_block& scb symbolic_compressed_block& scb
) { ) {
uint8_t bswapped[16]; uint8_t bswapped[16];
@ -298,7 +298,7 @@ void physical_to_symbolic(
scb.block_type = SYM_BTYPE_NONCONST; scb.block_type = SYM_BTYPE_NONCONST;
// Extract header fields // Extract header fields
int block_mode = read_bits(11, 0, pcb.data); int block_mode = read_bits(11, 0, pcb);
if ((block_mode & 0x1FF) == 0x1FC) if ((block_mode & 0x1FF) == 0x1FC)
{ {
// Constant color block // Constant color block
@ -316,24 +316,24 @@ void physical_to_symbolic(
scb.partition_count = 0; scb.partition_count = 0;
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
{ {
scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8); scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
} }
// Additionally, check that the void-extent reserved bits and extent fields are valid // Additionally, check that the void-extent reserved bits and extent fields are valid
if (bsd.zdim == 1) if (bsd.zdim == 1)
{ {
// 2D void-extent // 2D void-extent
int rsvbits = read_bits(2, 10, pcb.data); int rsvbits = read_bits(2, 10, pcb);
if (rsvbits != 3) if (rsvbits != 3)
{ {
scb.block_type = SYM_BTYPE_ERROR; scb.block_type = SYM_BTYPE_ERROR;
return; return;
} }
int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8); int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8); int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8); int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8); int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF; int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
@ -346,12 +346,12 @@ void physical_to_symbolic(
else else
{ {
// 3D void-extent // 3D void-extent
int vx_low_s = read_bits(9, 10, pcb.data); int vx_low_s = read_bits(9, 10, pcb);
int vx_high_s = read_bits(9, 19, pcb.data); int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb.data); int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb.data); int vx_high_t = read_bits(9, 37, pcb);
int vx_low_p = read_bits(9, 46, pcb.data); int vx_low_p = read_bits(9, 46, pcb);
int vx_high_p = read_bits(9, 55, pcb.data); int vx_high_p = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF; int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
@ -383,7 +383,7 @@ void physical_to_symbolic(
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count; int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int partition_count = read_bits(2, 11, pcb.data) + 1; int partition_count = read_bits(2, 11, pcb) + 1;
promise(partition_count > 0); promise(partition_count > 0);
scb.block_mode = static_cast<uint16_t>(block_mode); scb.block_mode = static_cast<uint16_t>(block_mode);
@ -391,7 +391,7 @@ void physical_to_symbolic(
for (int i = 0; i < 16; i++) for (int i = 0; i < 16; i++)
{ {
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i])); bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
} }
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method); int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
@ -432,14 +432,15 @@ void physical_to_symbolic(
int encoded_type_highpart_size = 0; int encoded_type_highpart_size = 0;
if (partition_count == 1) if (partition_count == 1)
{ {
color_formats[0] = read_bits(4, 13, pcb.data); color_formats[0] = read_bits(4, 13, pcb);
scb.partition_index = 0; scb.partition_index = 0;
} }
else else
{ {
encoded_type_highpart_size = (3 * partition_count) - 4; encoded_type_highpart_size = (3 * partition_count) - 4;
below_weights_pos -= encoded_type_highpart_size; below_weights_pos -= encoded_type_highpart_size;
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6); int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
int baseclass = encoded_type & 0x3; int baseclass = encoded_type & 0x3;
if (baseclass == 0) if (baseclass == 0)
{ {
@ -469,7 +470,8 @@ void physical_to_symbolic(
bitpos += 2; bitpos += 2;
} }
} }
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6)); scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
} }
for (int i = 0; i < partition_count; i++) for (int i = 0; i < partition_count; i++)
@ -515,7 +517,7 @@ void physical_to_symbolic(
scb.quant_mode = static_cast<quant_method>(color_quant_level); scb.quant_mode = static_cast<quant_method>(color_quant_level);
uint8_t values_to_decode[32]; uint8_t values_to_decode[32];
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data, decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS)); values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
int valuecount_to_decode = 0; int valuecount_to_decode = 0;
@ -534,6 +536,6 @@ void physical_to_symbolic(
scb.plane2_component = -1; scb.plane2_component = -1;
if (is_dual_plane) if (is_dual_plane)
{ {
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data)); scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
} }
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -241,6 +241,14 @@ struct vint8
return vint8(_mm256_broadcastd_epi32(a)); return vint8(_mm256_broadcastd_epi32(a));
} }
/**
* @brief Factory that returns a vector loaded from unaligned memory.
*/
static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
{
return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
}
/** /**
* @brief Factory that returns a vector loaded from 32B aligned memory. * @brief Factory that returns a vector loaded from 32B aligned memory.
*/ */
@ -1000,7 +1008,7 @@ ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
*/ */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{ {
a = round(a); a = a + vfloat8(0.5f);
return vint8(_mm256_cvttps_epi32(a.m)); return vint8(_mm256_cvttps_epi32(a.m));
} }
@ -1152,9 +1160,9 @@ ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
* *
* All masked lanes must be at the end of vector, after all non-masked lanes. * All masked lanes must be at the end of vector, after all non-masked lanes.
*/ */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask) ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{ {
_mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m); _mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
} }
/** /**
@ -1162,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask)
*/ */
ASTCENC_SIMD_INLINE void print(vint8 a) ASTCENC_SIMD_INLINE void print(vint8 a)
{ {
alignas(ASTCENC_VECALIGN) int v[8]; alignas(32) int v[8];
storea(a, v); storea(a, v);
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@ -1173,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
*/ */
ASTCENC_SIMD_INLINE void printx(vint8 a) ASTCENC_SIMD_INLINE void printx(vint8 a)
{ {
alignas(ASTCENC_VECALIGN) int v[8]; alignas(32) int v[8];
storea(a, v); storea(a, v);
printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
@ -1184,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
*/ */
ASTCENC_SIMD_INLINE void print(vfloat8 a) ASTCENC_SIMD_INLINE void print(vfloat8 a)
{ {
alignas(ASTCENC_VECALIGN) float v[8]; alignas(32) float v[8];
storea(a, v); storea(a, v);
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]), static_cast<double>(v[0]), static_cast<double>(v[1]),

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited // Copyright 2020-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
*/ */
ASTCENC_SIMD_INLINE void print(vint4 a) ASTCENC_SIMD_INLINE void print(vint4 a)
{ {
alignas(16) int v[4]; ASTCENC_ALIGNAS int v[4];
storea(a, v); storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n", printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]); v[0], v[1], v[2], v[3]);
@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
*/ */
ASTCENC_SIMD_INLINE void printx(vint4 a) ASTCENC_SIMD_INLINE void printx(vint4 a)
{ {
alignas(16) int v[4]; ASTCENC_ALIGNAS int v[4];
storea(a, v); storea(a, v);
printf("v4_i32:\n %08x %08x %08x %08x\n", printf("v4_i32:\n %08x %08x %08x %08x\n",
v[0], v[1], v[2], v[3]); v[0], v[1], v[2], v[3]);
@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
*/ */
ASTCENC_SIMD_INLINE void print(vfloat4 a) ASTCENC_SIMD_INLINE void print(vfloat4 a)
{ {
alignas(16) float v[4]; ASTCENC_ALIGNAS float v[4];
storea(a, v); storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]), static_cast<double>(v[0]), static_cast<double>(v[1]),

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -38,6 +38,7 @@
#endif #endif
#include <cstdio> #include <cstdio>
#include <cstring>
// ============================================================================ // ============================================================================
// vfloat4 data type // vfloat4 data type
@ -269,6 +270,16 @@ struct vint4
return vint4(*p); return vint4(*p);
} }
/**
* @brief Factory that returns a vector loaded from unaligned memory.
*/
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
vint4 data;
std::memcpy(&data.m, p, 4 * sizeof(int));
return data;
}
/** /**
* @brief Factory that returns a vector loaded from 16B aligned memory. * @brief Factory that returns a vector loaded from 16B aligned memory.
*/ */
@ -348,9 +359,9 @@ struct vmask4
/** /**
* @brief Get the scalar from a single lane. * @brief Get the scalar from a single lane.
*/ */
template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
{ {
return vgetq_lane_u32(m, l); return vgetq_lane_u32(m, l) != 0;
} }
/** /**
@ -584,6 +595,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
vst1q_s32(p, a.m); vst1q_s32(p, a.m);
} }
/**
* @brief Store a vector to an unaligned memory address.
*/
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
std::memcpy(p, &a.m, sizeof(int) * 4);
}
/** /**
* @brief Store lowest N (vector width) bytes into an unaligned address. * @brief Store lowest N (vector width) bytes into an unaligned address.
*/ */
@ -849,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/ */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{ {
a = round(a); a = a + vfloat4(0.5f);
return vint4(vcvtq_s32_f32(a.m)); return vint4(vcvtq_s32_f32(a.m));
} }
@ -1027,31 +1046,39 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
} }
/**
* @brief Store a single vector lane to an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
std::memcpy(base, &data, sizeof(int));
}
/** /**
* @brief Store a vector, skipping masked lanes. * @brief Store a vector, skipping masked lanes.
* *
* All masked lanes must be at the end of vector, after all non-masked lanes. * All masked lanes must be at the end of vector, after all non-masked lanes.
*/ */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{ {
if (mask.lane<3>()) if (mask.lane<3>())
{ {
store(data, base); store(data, base);
} }
else if (mask.lane<2>()) else if (mask.lane<2>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
base[2] = data.lane<2>(); store_lane(base + 8, data.lane<2>());
} }
else if (mask.lane<1>()) else if (mask.lane<1>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
} }
else if (mask.lane<0>()) else if (mask.lane<0>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
} }
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -275,6 +275,16 @@ struct vint4
return vint4(*p); return vint4(*p);
} }
/**
* @brief Factory that returns a vector loaded from unaligned memory.
*/
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
vint4 data;
std::memcpy(&data.m, p, 4 * sizeof(int));
return data;
}
/** /**
* @brief Factory that returns a vector loaded from 16B aligned memory. * @brief Factory that returns a vector loaded from 16B aligned memory.
*/ */
@ -341,6 +351,13 @@ struct vmask4
m[3] = d == false ? 0 : -1; m[3] = d == false ? 0 : -1;
} }
/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
return m[l] != 0;
}
/** /**
* @brief The vector ... * @brief The vector ...
@ -644,13 +661,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
p[3] = a.m[3]; p[3] = a.m[3];
} }
/**
* @brief Store a vector to an unaligned memory address.
*/
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
std::memcpy(p, a.m, sizeof(int) * 4);
}
/** /**
* @brief Store lowest N (vector width) bytes into an unaligned address. * @brief Store lowest N (vector width) bytes into an unaligned address.
*/ */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{ {
int* pi = reinterpret_cast<int*>(p); std::memcpy(p, a.m, sizeof(uint8_t) * 4);
*pi = a.m[0];
} }
/** /**
@ -963,10 +987,11 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/ */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{ {
return vint4(static_cast<int>(a.m[0] + 0.5f), a = a + vfloat4(0.5f);
static_cast<int>(a.m[1] + 0.5f), return vint4(static_cast<int>(a.m[0]),
static_cast<int>(a.m[2] + 0.5f), static_cast<int>(a.m[1]),
static_cast<int>(a.m[3] + 0.5f)); static_cast<int>(a.m[2]),
static_cast<int>(a.m[3]));
} }
/** /**
@ -1030,7 +1055,7 @@ ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a) ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{ {
vint4 r; vint4 r;
memcpy(r.m, a.m, 4 * 4); std::memcpy(r.m, a.m, 4 * 4);
return r; return r;
} }
@ -1044,7 +1069,7 @@ ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{ {
vfloat4 r; vfloat4 r;
memcpy(r.m, a.m, 4 * 4); std::memcpy(r.m, a.m, 4 * 4);
return r; return r;
} }
@ -1079,12 +1104,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
} }
/** /**
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
*/ */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{ {
uint8_t table[16]; uint8_t table[16];
storea(t0, reinterpret_cast<int*>(table + 0));
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1099,8 +1125,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{ {
uint8_t table[32]; uint8_t table[32];
storea(t0, reinterpret_cast<int*>(table + 0));
storea(t1, reinterpret_cast<int*>(table + 16)); std::memcpy(table + 0, t0.m, 4 * sizeof(int));
std::memcpy(table + 16, t1.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1114,10 +1141,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{ {
uint8_t table[64]; uint8_t table[64];
storea(t0, reinterpret_cast<int*>(table + 0));
storea(t1, reinterpret_cast<int*>(table + 16)); std::memcpy(table + 0, t0.m, 4 * sizeof(int));
storea(t2, reinterpret_cast<int*>(table + 32)); std::memcpy(table + 16, t1.m, 4 * sizeof(int));
storea(t3, reinterpret_cast<int*>(table + 48)); std::memcpy(table + 32, t2.m, 4 * sizeof(int));
std::memcpy(table + 48, t3.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1138,12 +1166,21 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
} }
/**
* @brief Store a single vector lane to an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
std::memcpy(base, &data, sizeof(int));
}
/** /**
* @brief Store a vector, skipping masked lanes. * @brief Store a vector, skipping masked lanes.
* *
* All masked lanes must be at the end of vector, after all non-masked lanes. * All masked lanes must be at the end of vector, after all non-masked lanes.
* Input is a byte array of at least 4 bytes per unmasked entry.
*/ */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{ {
if (mask.m[3]) if (mask.m[3])
{ {
@ -1151,18 +1188,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
} }
else if (mask.m[2]) else if (mask.m[2])
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
base[2] = data.lane<2>(); store_lane(base + 8, data.lane<2>());
} }
else if (mask.m[1]) else if (mask.m[1])
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
} }
else if (mask.m[0]) else if (mask.m[0])
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
} }
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited // Copyright 2019-2023 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -39,6 +39,7 @@
#endif #endif
#include <cstdio> #include <cstdio>
#include <cstring>
// ============================================================================ // ============================================================================
// vfloat4 data type // vfloat4 data type
@ -292,6 +293,18 @@ struct vint4
return vint4(*p); return vint4(*p);
} }
/**
* @brief Factory that returns a vector loaded from unaligned memory.
*/
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
#if ASTCENC_SSE >= 41
return vint4(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(p)));
#else
return vint4(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
#endif
}
/** /**
* @brief Factory that returns a vector loaded from 16B aligned memory. * @brief Factory that returns a vector loaded from 16B aligned memory.
*/ */
@ -366,9 +379,9 @@ struct vmask4
/** /**
* @brief Get the scalar value of a single lane. * @brief Get the scalar value of a single lane.
*/ */
template <int l> ASTCENC_SIMD_INLINE float lane() const template <int l> ASTCENC_SIMD_INLINE bool lane() const
{ {
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
} }
/** /**
@ -633,6 +646,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m)); _mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
} }
/**
* @brief Store a vector to an unaligned memory address.
*/
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
std::memcpy(p, &a.m, sizeof(int) * 4);
}
/** /**
* @brief Store lowest N (vector width) bytes into an unaligned address. * @brief Store lowest N (vector width) bytes into an unaligned address.
*/ */
@ -934,7 +955,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/ */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{ {
a = round(a); a = a + vfloat4(0.5f);
return vint4(_mm_cvttps_epi32(a.m)); return vint4(_mm_cvttps_epi32(a.m));
} }
@ -1087,8 +1108,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
__m128i result = _mm_shuffle_epi8(t0.m, idxx); __m128i result = _mm_shuffle_epi8(t0.m, idxx);
return vint4(result); return vint4(result);
#else #else
alignas(ASTCENC_VECALIGN) uint8_t table[16]; uint8_t table[16];
storea(t0, reinterpret_cast<int*>(table + 0));
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1114,9 +1136,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
return vint4(result); return vint4(result);
#else #else
alignas(ASTCENC_VECALIGN) uint8_t table[32]; uint8_t table[32];
storea(t0, reinterpret_cast<int*>(table + 0));
storea(t1, reinterpret_cast<int*>(table + 16)); std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1150,11 +1173,12 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3
return vint4(result); return vint4(result);
#else #else
alignas(ASTCENC_VECALIGN) uint8_t table[64]; uint8_t table[64];
storea(t0, reinterpret_cast<int*>(table + 0));
storea(t1, reinterpret_cast<int*>(table + 16)); std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
storea(t2, reinterpret_cast<int*>(table + 32)); std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
storea(t3, reinterpret_cast<int*>(table + 48)); std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
return vint4(table[idx.lane<0>()], return vint4(table[idx.lane<0>()],
table[idx.lane<1>()], table[idx.lane<1>()],
@ -1190,15 +1214,23 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
#endif #endif
} }
/**
* @brief Store a single vector lane to an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
std::memcpy(base, &data, sizeof(int));
}
/** /**
* @brief Store a vector, skipping masked lanes. * @brief Store a vector, skipping masked lanes.
* *
* All masked lanes must be at the end of vector, after all non-masked lanes. * All masked lanes must be at the end of vector, after all non-masked lanes.
*/ */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{ {
#if ASTCENC_AVX >= 2 #if ASTCENC_AVX >= 2
_mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m); _mm_maskstore_epi32(reinterpret_cast<int*>(base), _mm_castps_si128(mask.m), data.m);
#else #else
// Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee // Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
// fault suppression on masked lanes so we can get page faults at the end of an image. // fault suppression on masked lanes so we can get page faults at the end of an image.
@ -1208,18 +1240,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
} }
else if (mask.lane<2>() != 0.0f) else if (mask.lane<2>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
base[2] = data.lane<2>(); store_lane(base + 8, data.lane<2>());
} }
else if (mask.lane<1>() != 0.0f) else if (mask.lane<1>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
base[1] = data.lane<1>(); store_lane(base + 4, data.lane<1>());
} }
else if (mask.lane<0>() != 0.0f) else if (mask.lane<0>() != 0.0f)
{ {
base[0] = data.lane<0>(); store_lane(base + 0, data.lane<0>());
} }
#endif #endif
} }

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited // Copyright 2011-2024 Arm Limited
// //
// Licensed under the Apache License, Version 2.0 (the "License"); you may not // Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy // use this file except in compliance with the License. You may obtain a copy
@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
}; };
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS) #if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true }; static bool print_once { true };
@ -99,7 +99,7 @@ static void compute_angular_offsets(
promise(weight_count > 0); promise(weight_count > 0);
promise(max_angular_steps > 0); promise(max_angular_steps > 0);
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS]; ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long // Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value, compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets); max_angular_steps, angular_offsets);
alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps, max_angular_steps, max_quant_steps,

View file

@ -1,81 +0,0 @@
From 02c22d3df501dc284ba732fa82a6c408c57b3237 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= <rverschelde@gmail.com>
Date: Thu, 19 Jan 2023 23:30:13 +0100
Subject: [PATCH] mathlib: Remove incomplete support for SSE3 which assumed
SSSE3
`_mm_shuffle_epi8` requires SSSE3 so the check on `ASTCENC_SSE >= 30` is
too lax and would fail if `__SSE3__` is supported, but not `__SSSE3__`.
The only supported configurations are SSE2, SSE4.1, and AVX2, so as
discussed in #393 we drop the SSE3 checks and require SSE4.1 instead.
---
Source/astcenc_mathlib.h | 2 --
Source/astcenc_vecmathlib_sse_4.h | 10 +++++-----
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
index 67e989e..0540c4f 100644
--- a/Source/astcenc_mathlib.h
+++ b/Source/astcenc_mathlib.h
@@ -48,8 +48,6 @@
#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
#define ASTCENC_SSE 41
- #elif defined(__SSE3__)
- #define ASTCENC_SSE 30
#elif defined(__SSE2__)
#define ASTCENC_SSE 20
#else
diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h
index 76fe577..26dcc4a 100644
--- a/Source/astcenc_vecmathlib_sse_4.h
+++ b/Source/astcenc_vecmathlib_sse_4.h
@@ -1046,7 +1046,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
*/
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
-#if ASTCENC_SSE >= 30
+#if ASTCENC_SSE >= 41
t0p = t0;
t1p = t0 ^ t1;
#else
@@ -1062,7 +1062,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vint4 t0, vint4 t1, vint4 t2, vint4 t3,
vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
-#if ASTCENC_SSE >= 30
+#if ASTCENC_SSE >= 41
t0p = t0;
t1p = t0 ^ t1;
t2p = t1 ^ t2;
@@ -1080,7 +1080,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
*/
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
-#if ASTCENC_SSE >= 30
+#if ASTCENC_SSE >= 41
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
@@ -1102,7 +1102,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
*/
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
-#if ASTCENC_SSE >= 30
+#if ASTCENC_SSE >= 41
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
@@ -1130,7 +1130,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
*/
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
-#if ASTCENC_SSE >= 30
+#if ASTCENC_SSE >= 41
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
--
2.39.1