diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index e375280c6a1..3c0b544781b 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -165,7 +165,7 @@ License: BSD-3-clause Files: ./thirdparty/astcenc/ Comment: Arm ASTC Encoder -Copyright: 2011-2023, Arm Limited +Copyright: 2011-2024, Arm Limited License: Apache-2.0 Files: ./thirdparty/basis_universal/ diff --git a/thirdparty/README.md b/thirdparty/README.md index 4c276b3c001..cb03ddcb36c 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -47,7 +47,7 @@ Files extracted from upstream source: ## astcenc - Upstream: https://github.com/ARM-software/astc-encoder -- Version: 4.4.0 (5a5b5a1ef60dd47c27c28c66c118d22c40e3197e, 2023) +- Version: 4.7.0 (1a51f2915121275038677317c8bf61f1a78b590c, 2024) - License: Apache 2.0 Files extracted from upstream source: diff --git a/thirdparty/astcenc/astcenc.h b/thirdparty/astcenc/astcenc.h index dbf45998416..3d04b4ea583 100644 --- a/thirdparty/astcenc/astcenc.h +++ b/thirdparty/astcenc/astcenc.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2023 Arm Limited +// Copyright 2020-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -215,6 +215,8 @@ enum astcenc_error { ASTCENC_ERR_BAD_CONTEXT, /** @brief The call failed due to unimplemented functionality. */ ASTCENC_ERR_NOT_IMPLEMENTED, + /** @brief The call failed due to an out-of-spec decode mode flag set. */ + ASTCENC_ERR_BAD_DECODE_MODE, #if defined(ASTCENC_DIAGNOSTICS) /** @brief The call failed due to an issue with diagnostic tracing. */ ASTCENC_ERR_DTRACE_FAILURE, @@ -302,6 +304,11 @@ enum astcenc_type ASTCENC_TYPE_F32 = 2 }; +/** + * @brief Function pointer type for compression progress reporting callback. + */ +extern "C" typedef void (*astcenc_progress_callback)(float); + /** * @brief Enable normal map compression. * @@ -312,6 +319,19 @@ enum astcenc_type */ static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0; +/** + * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode. + * + * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this + * flag during compression will allow the compressor to use the correct rounding when selecting + * encodings. This will improve the compressed image quality if your application is using the + * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16. + * + * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of + * this setting. + */ +static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1; + /** * @brief Enable alpha weighting. * @@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS = ASTCENC_FLG_MAP_RGBM | ASTCENC_FLG_USE_ALPHA_WEIGHT | ASTCENC_FLG_USE_PERCEPTUAL | + ASTCENC_FLG_USE_DECODE_UNORM8 | ASTCENC_FLG_DECOMPRESS_ONLY | ASTCENC_FLG_SELF_DECOMPRESS_ONLY; @@ -542,6 +563,24 @@ struct astcenc_config */ float tune_2plane_early_out_limit_correlation; + /** + * @brief The config enable for the mode0 fast-path search. + * + * If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0 + * search is enabled. This option is ineffective for 3D block sizes. + */ + float tune_search_mode0_enable; + + /** + * @brief The progress callback, can be @c nullptr. + * + * If this is specified the codec will peridocially report progress for + * compression as a percentage between 0 and 100. The callback is called from one + * of the compressor threads, so doing significant work in the callback will + * reduce compression performance. + */ + astcenc_progress_callback progress_callback; + #if defined(ASTCENC_DIAGNOSTICS) /** * @brief The path to save the diagnostic trace data to. diff --git a/thirdparty/astcenc/astcenc_color_quantize.cpp b/thirdparty/astcenc/astcenc_color_quantize.cpp index b0fec7a74c4..df17cac3c7f 100644 --- a/thirdparty/astcenc/astcenc_color_quantize.cpp +++ b/thirdparty/astcenc/astcenc_color_quantize.cpp @@ -40,6 +40,27 @@ #include "astcenc_internal.h" +/** + * @brief Compute the error of an LDR RGB or RGBA encoding. + * + * @param uquant0 The original endpoint 0 color. + * @param uquant1 The original endpoint 1 color. + * @param quant0 The unpacked quantized endpoint 0 color. + * @param quant1 The unpacked quantized endpoint 1 color. + * + * @return The MSE of the encoding. + */ +static float get_rgba_encoding_error( + vfloat4 uquant0, + vfloat4 uquant1, + vint4 quant0, + vint4 quant1 +) { + vfloat4 error0 = uquant0 - int_to_float(quant0); + vfloat4 error1 = uquant1 - int_to_float(quant1); + return hadd_s(error0 * error0 + error1 * error1); +} + /** * @brief Determine the quantized value given a quantization level. * @@ -56,6 +77,26 @@ static inline uint8_t quant_color( return color_unquant_to_uquant_tables[quant_level - QUANT_6][index]; } +/** + * @brief Determine the quantized value given a quantization level. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. This must be in the 0-255 range. + * + * @return The unpacked quantized value, returned in 0-255 range. + */ +static inline vint4 quant_color3( + quant_method quant_level, + vint4 value +) { + vint4 index = value * 2 + 1; + return vint4( + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<0>()], + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<1>()], + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<2>()], + 0); +} + /** * @brief Determine the quantized value given a quantization level and residual. * @@ -83,6 +124,35 @@ static inline uint8_t quant_color( return color_unquant_to_uquant_tables[quant_level - QUANT_6][index]; } +/** + * @brief Determine the quantized value given a quantization level and residual. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. This must be in the 0-255 range. + * @param valuef The original value before rounding, used to compute a residual. + * + * @return The unpacked quantized value, returned in 0-255 range. + */ +static inline vint4 quant_color3( + quant_method quant_level, + vint4 value, + vfloat4 valuef +) { + vint4 index = value * 2; + + // Compute the residual to determine if we should round down or up ties. + // Test should be residual >= 0, but empirical testing shows small bias helps. + vfloat4 residual = valuef - int_to_float(value); + vmask4 mask = residual >= vfloat4(-0.1f); + index = select(index, index + 1, mask); + + return vint4( + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<0>()], + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<1>()], + color_unquant_to_uquant_tables[quant_level - QUANT_6][index.lane<2>()], + 0); +} + /** * @brief Quantize an LDR RGB color. * @@ -92,47 +162,33 @@ static inline uint8_t quant_color( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. * @param quant_level The quantization level to use. */ static void quantize_rgb( vfloat4 color0, vfloat4 color1, - uint8_t output[6], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; + vint4 color0i, color1i; + vfloat4 nudge(0.2f); - float r0 = astc::clamp255f(color0.lane<0>() * scale); - float g0 = astc::clamp255f(color0.lane<1>() * scale); - float b0 = astc::clamp255f(color0.lane<2>() * scale); - - float r1 = astc::clamp255f(color1.lane<0>() * scale); - float g1 = astc::clamp255f(color1.lane<1>() * scale); - float b1 = astc::clamp255f(color1.lane<2>() * scale); - - int ri0, gi0, bi0, ri1, gi1, bi1; - float rgb0_addon = 0.0f; - float rgb1_addon = 0.0f; do { - ri0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(r0 + rgb0_addon), 0), r0 + rgb0_addon); - gi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(g0 + rgb0_addon), 0), g0 + rgb0_addon); - bi0 = quant_color(quant_level, astc::max(astc::flt2int_rtn(b0 + rgb0_addon), 0), b0 + rgb0_addon); - ri1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(r1 + rgb1_addon), 255), r1 + rgb1_addon); - gi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(g1 + rgb1_addon), 255), g1 + rgb1_addon); - bi1 = quant_color(quant_level, astc::min(astc::flt2int_rtn(b1 + rgb1_addon), 255), b1 + rgb1_addon); + vint4 color0q = max(float_to_int_rtn(color0), vint4(0)); + color0i = quant_color3(quant_level, color0q, color0); + color0 = color0 - nudge; - rgb0_addon -= 0.2f; - rgb1_addon += 0.2f; - } while (ri0 + gi0 + bi0 > ri1 + gi1 + bi1); + vint4 color1q = min(float_to_int_rtn(color1), vint4(255)); + color1i = quant_color3(quant_level, color1q, color1); + color1 = color1 + nudge; + } while (hadd_rgb_s(color0i) > hadd_rgb_s(color1i)); - output[0] = static_cast(ri0); - output[1] = static_cast(ri1); - output[2] = static_cast(gi0); - output[3] = static_cast(gi1); - output[4] = static_cast(bi0); - output[5] = static_cast(bi1); + color0_out = color0i; + color1_out = color1i; } /** @@ -145,24 +201,24 @@ static void quantize_rgb( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1). + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. * @param quant_level The quantization level to use. */ static void quantize_rgba( vfloat4 color0, vfloat4 color1, - uint8_t output[8], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; + quantize_rgb(color0, color1, color0_out, color1_out, quant_level); - float a0 = astc::clamp255f(color0.lane<3>() * scale); - float a1 = astc::clamp255f(color1.lane<3>() * scale); + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); - output[6] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); - output[7] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); - - quantize_rgb(color0, color1, output, quant_level); + color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0)); + color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1)); } /** @@ -172,7 +228,8 @@ static void quantize_rgba( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0). + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -180,54 +237,35 @@ static void quantize_rgba( static bool try_quantize_rgb_blue_contract( vfloat4 color0, vfloat4 color1, - uint8_t output[6], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; + // Apply inverse blue-contraction + color0 += color0 - color0.swz<2, 2, 2, 3>(); + color1 += color1 - color1.swz<2, 2, 2, 3>(); - float r0 = color0.lane<0>() * scale; - float g0 = color0.lane<1>() * scale; - float b0 = color0.lane<2>() * scale; - - float r1 = color1.lane<0>() * scale; - float g1 = color1.lane<1>() * scale; - float b1 = color1.lane<2>() * scale; - - // Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used. - r0 += (r0 - b0); - g0 += (g0 - b0); - r1 += (r1 - b1); - g1 += (g1 - b1); - - if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f || - r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f) + // If anything overflows BC cannot be used + vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f)); + vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f)); + if (any(color0_error | color1_error)) { return false; } - // Quantize the inverse-blue-contracted color - int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0), r0); - int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0), g0); - int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0), b0); + // Quantize the inverse blue-contracted color + vint4 color0i = quant_color3(quant_level, float_to_int_rtn(color0), color0); + vint4 color1i = quant_color3(quant_level, float_to_int_rtn(color1), color1); - int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1), r1); - int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1), g1); - int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1), b1); - - // If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that - // blue-contraction and quantization change this order, which is why we must test afterwards. - if (ri1 + gi1 + bi1 <= ri0 + gi0 + bi0) + // If color #1 is not larger than color #0 then blue-contraction cannot be used + // We must test afterwards because quantization can change the order + if (hadd_rgb_s(color1i) <= hadd_rgb_s(color0i)) { return false; } - output[0] = static_cast(ri1); - output[1] = static_cast(ri0); - output[2] = static_cast(gi1); - output[3] = static_cast(gi0); - output[4] = static_cast(bi1); - output[5] = static_cast(bi0); - + color0_out = color1i; + color1_out = color0i; return true; } @@ -238,7 +276,8 @@ static bool try_quantize_rgb_blue_contract( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0, a1, a0). + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -246,18 +285,22 @@ static bool try_quantize_rgb_blue_contract( static bool try_quantize_rgba_blue_contract( vfloat4 color0, vfloat4 color1, - uint8_t output[8], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; + if (try_quantize_rgb_blue_contract(color0, color1, color0_out, color1_out, quant_level)) + { + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); - float a0 = astc::clamp255f(color0.lane<3>() * scale); - float a1 = astc::clamp255f(color1.lane<3>() * scale); + color0_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a1), a1)); + color1_out.set_lane<3>(quant_color(quant_level, astc::flt2int_rtn(a0), a0)); - output[6] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); - output[7] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); + return true; + } - return try_quantize_rgb_blue_contract(color0, color1, output, quant_level); + return false; } /** @@ -269,7 +312,8 @@ static bool try_quantize_rgba_blue_contract( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -277,85 +321,54 @@ static bool try_quantize_rgba_blue_contract( static bool try_quantize_rgb_delta( vfloat4 color0, vfloat4 color1, - uint8_t output[6], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; - - float r0 = astc::clamp255f(color0.lane<0>() * scale); - float g0 = astc::clamp255f(color0.lane<1>() * scale); - float b0 = astc::clamp255f(color0.lane<2>() * scale); - - float r1 = astc::clamp255f(color1.lane<0>() * scale); - float g1 = astc::clamp255f(color1.lane<1>() * scale); - float b1 = astc::clamp255f(color1.lane<2>() * scale); - - // Transform r0 to unorm9 - int r0a = astc::flt2int_rtn(r0); - int g0a = astc::flt2int_rtn(g0); - int b0a = astc::flt2int_rtn(b0); - - r0a <<= 1; - g0a <<= 1; - b0a <<= 1; + // Transform color0 to unorm9 + vint4 color0a = float_to_int_rtn(color0); + color0.set_lane<3>(0.0f); + color0a = lsl<1>(color0a); // Mask off the top bit - int r0b = r0a & 0xFF; - int g0b = g0a & 0xFF; - int b0b = b0a & 0xFF; + vint4 color0b = color0a & 0xFF; // Quantize then unquantize in order to get a value that we take differences against - int r0be = quant_color(quant_level, r0b); - int g0be = quant_color(quant_level, g0b); - int b0be = quant_color(quant_level, b0b); - - r0b = r0be | (r0a & 0x100); - g0b = g0be | (g0a & 0x100); - b0b = b0be | (b0a & 0x100); + vint4 color0be = quant_color3(quant_level, color0b); + color0b = color0be | (color0a & 0x100); // Get hold of the second value - int r1d = astc::flt2int_rtn(r1); - int g1d = astc::flt2int_rtn(g1); - int b1d = astc::flt2int_rtn(b1); - - r1d <<= 1; - g1d <<= 1; - b1d <<= 1; + vint4 color1d = float_to_int_rtn(color1); + color1d = lsl<1>(color1d); // ... and take differences - r1d -= r0b; - g1d -= g0b; - b1d -= b0b; + color1d = color1d - color0b; + color1d.set_lane<3>(0); // Check if the difference is too large to be encodable - if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64) + if (any((color1d > vint4(63)) | (color1d < vint4(-64)))) { return false; } // Insert top bit of the base into the offset - r1d &= 0x7F; - g1d &= 0x7F; - b1d &= 0x7F; - - r1d |= (r0b & 0x100) >> 1; - g1d |= (g0b & 0x100) >> 1; - b1d |= (b0b & 0x100) >> 1; + color1d = color1d & 0x7F; + color1d = color1d | lsr<1>(color0b & 0x100); // Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails // since we have then corrupted either the top bit of the base or the sign bit of the offset - int r1de = quant_color(quant_level, r1d); - int g1de = quant_color(quant_level, g1d); - int b1de = quant_color(quant_level, b1d); + vint4 color1de = quant_color3(quant_level, color1d); - if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0) + vint4 color_flips = (color1d ^ color1de) & 0xC0; + color_flips.set_lane<3>(0); + if (any(color_flips != vint4::zero())) { return false; } // If the sum of offsets triggers blue-contraction then encoding fails - vint4 ep0(r0be, g0be, b0be, 0); - vint4 ep1(r1de, g1de, b1de, 0); + vint4 ep0 = color0be; + vint4 ep1 = color1de; bit_transfer_signed(ep1, ep0); if (hadd_rgb_s(ep1) < 0) { @@ -369,111 +382,90 @@ static bool try_quantize_rgb_delta( return false; } - output[0] = static_cast(r0be); - output[1] = static_cast(r1de); - output[2] = static_cast(g0be); - output[3] = static_cast(g1de); - output[4] = static_cast(b0be); - output[5] = static_cast(b1de); - + color0_out = color0be; + color1_out = color1de; return true; } +/** + * @brief Try to quantize an LDR RGB color using delta encoding and blue-contraction. + * + * Blue-contraction is only usable if encoded color 1 RGB is larger than color 0 RGB. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] color0_out The output quantized color0 endpoint. + * @param[out] color1_out The output quantized color1 endpoint. + * @param quant_level The quantization level to use. + * + * @return Returns @c false on failure, @c true on success. + */ static bool try_quantize_rgb_delta_blue_contract( vfloat4 color0, vfloat4 color1, - uint8_t output[6], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { // Note: Switch around endpoint colors already at start - float scale = 1.0f / 257.0f; + std::swap(color0, color1); - float r1 = color0.lane<0>() * scale; - float g1 = color0.lane<1>() * scale; - float b1 = color0.lane<2>() * scale; + // Apply inverse blue-contraction + color0 += color0 - color0.swz<2, 2, 2, 3>(); + color1 += color1 - color1.swz<2, 2, 2, 3>(); - float r0 = color1.lane<0>() * scale; - float g0 = color1.lane<1>() * scale; - float b0 = color1.lane<2>() * scale; - - // Apply inverse blue-contraction. This can produce an overflow; which means BC cannot be used. - r0 += (r0 - b0); - g0 += (g0 - b0); - r1 += (r1 - b1); - g1 += (g1 - b1); - - if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f || - r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f) + // If anything overflows BC cannot be used + vmask4 color0_error = (color0 < vfloat4(0.0f)) | (color0 > vfloat4(255.0f)); + vmask4 color1_error = (color1 < vfloat4(0.0f)) | (color1 > vfloat4(255.0f)); + if (any(color0_error | color1_error)) { return false; } - // Transform r0 to unorm9 - int r0a = astc::flt2int_rtn(r0); - int g0a = astc::flt2int_rtn(g0); - int b0a = astc::flt2int_rtn(b0); - r0a <<= 1; - g0a <<= 1; - b0a <<= 1; + // Transform color0 to unorm9 + vint4 color0a = float_to_int_rtn(color0); + color0.set_lane<3>(0.0f); + color0a = lsl<1>(color0a); // Mask off the top bit - int r0b = r0a & 0xFF; - int g0b = g0a & 0xFF; - int b0b = b0a & 0xFF; + vint4 color0b = color0a & 0xFF; - // Quantize, then unquantize in order to get a value that we take differences against. - int r0be = quant_color(quant_level, r0b); - int g0be = quant_color(quant_level, g0b); - int b0be = quant_color(quant_level, b0b); - - r0b = r0be | (r0a & 0x100); - g0b = g0be | (g0a & 0x100); - b0b = b0be | (b0a & 0x100); + // Quantize then unquantize in order to get a value that we take differences against + vint4 color0be = quant_color3(quant_level, color0b); + color0b = color0be | (color0a & 0x100); // Get hold of the second value - int r1d = astc::flt2int_rtn(r1); - int g1d = astc::flt2int_rtn(g1); - int b1d = astc::flt2int_rtn(b1); + vint4 color1d = float_to_int_rtn(color1); + color1d = lsl<1>(color1d); - r1d <<= 1; - g1d <<= 1; - b1d <<= 1; - - // .. and take differences! - r1d -= r0b; - g1d -= g0b; - b1d -= b0b; + // ... and take differences + color1d = color1d - color0b; + color1d.set_lane<3>(0); // Check if the difference is too large to be encodable - if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64) + if (any((color1d > vint4(63)) | (color1d < vint4(-64)))) { return false; } // Insert top bit of the base into the offset - r1d &= 0x7F; - g1d &= 0x7F; - b1d &= 0x7F; + color1d = color1d & 0x7F; + color1d = color1d | lsr<1>(color0b & 0x100); - r1d |= (r0b & 0x100) >> 1; - g1d |= (g0b & 0x100) >> 1; - b1d |= (b0b & 0x100) >> 1; + // Then quantize and unquantize; if this causes either top two bits to flip, then encoding fails + // since we have then corrupted either the top bit of the base or the sign bit of the offset + vint4 color1de = quant_color3(quant_level, color1d); - // Then quantize and unquantize; if this causes any of the top two bits to flip, - // then encoding fails, since we have then corrupted either the top bit of the base - // or the sign bit of the offset. - int r1de = quant_color(quant_level, r1d); - int g1de = quant_color(quant_level, g1d); - int b1de = quant_color(quant_level, b1d); - - if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0) + vint4 color_flips = (color1d ^ color1de) & 0xC0; + color_flips.set_lane<3>(0); + if (any(color_flips != vint4::zero())) { return false; } // If the sum of offsets does not trigger blue-contraction then encoding fails - vint4 ep0(r0be, g0be, b0be, 0); - vint4 ep1(r1de, g1de, b1de, 0); + vint4 ep0 = color0be; + vint4 ep1 = color1de; bit_transfer_signed(ep1, ep0); if (hadd_rgb_s(ep1) >= 0) { @@ -487,13 +479,8 @@ static bool try_quantize_rgb_delta_blue_contract( return false; } - output[0] = static_cast(r0be); - output[1] = static_cast(r1de); - output[2] = static_cast(g0be); - output[3] = static_cast(g1de); - output[4] = static_cast(b0be); - output[5] = static_cast(b1de); - + color0_out = color0be; + color1_out = color1de; return true; } @@ -508,7 +495,8 @@ static bool try_quantize_rgb_delta_blue_contract( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (x, x, x, x, x, x, a0, a1). + * @param[out] color0_out The output quantized color0 endpoint; must preserve lane 0/1/2. + * @param[out] color1_out The output quantized color1 endpoint; must preserve lane 0/1/2. * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -516,13 +504,12 @@ static bool try_quantize_rgb_delta_blue_contract( static bool try_quantize_alpha_delta( vfloat4 color0, vfloat4 color1, - uint8_t output[8], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - float scale = 1.0f / 257.0f; - - float a0 = astc::clamp255f(color0.lane<3>() * scale); - float a1 = astc::clamp255f(color1.lane<3>() * scale); + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); int a0a = astc::flt2int_rtn(a0); a0a <<= 1; @@ -561,8 +548,8 @@ static bool try_quantize_alpha_delta( return false; } - output[6] = static_cast(a0be); - output[7] = static_cast(a1de); + color0_out.set_lane<3>(a0be); + color1_out.set_lane<3>(a1de); return true; } @@ -589,13 +576,11 @@ static bool try_quantize_luminance_alpha_delta( uint8_t output[4], quant_method quant_level ) { - float scale = 1.0f / 257.0f; + float l0 = hadd_rgb_s(color0) * (1.0f / 3.0f); + float l1 = hadd_rgb_s(color1) * (1.0f / 3.0f); - float l0 = astc::clamp255f(hadd_rgb_s(color0) * ((1.0f / 3.0f) * scale)); - float l1 = astc::clamp255f(hadd_rgb_s(color1) * ((1.0f / 3.0f) * scale)); - - float a0 = astc::clamp255f(color0.lane<3>() * scale); - float a1 = astc::clamp255f(color1.lane<3>() * scale); + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); int l0a = astc::flt2int_rtn(l0); int a0a = astc::flt2int_rtn(a0); @@ -693,7 +678,8 @@ static bool try_quantize_luminance_alpha_delta( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1). + * @param[out] color0_out The output quantized color0 endpoint + * @param[out] color1_out The output quantized color1 endpoint * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -701,14 +687,14 @@ static bool try_quantize_luminance_alpha_delta( static bool try_quantize_rgba_delta( vfloat4 color0, vfloat4 color1, - uint8_t output[8], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { - return try_quantize_rgb_delta(color0, color1, output, quant_level) && - try_quantize_alpha_delta(color0, color1, output, quant_level); + return try_quantize_rgb_delta(color0, color1, color0_out, color1_out, quant_level) && + try_quantize_alpha_delta(color0, color1, color0_out, color1_out, quant_level); } - /** * @brief Try to quantize an LDR RGBA color using delta and blue contract encoding. * @@ -720,7 +706,8 @@ static bool try_quantize_rgba_delta( * * @param color0 The input unquantized color0 endpoint. * @param color1 The input unquantized color1 endpoint. - * @param[out] output The output endpoints, returned as (r0, r1, b0, b1, g0, g1, a0, a1). + * @param[out] color0_out The output quantized color0 endpoint + * @param[out] color1_out The output quantized color1 endpoint * @param quant_level The quantization level to use. * * @return Returns @c false on failure, @c true on success. @@ -728,12 +715,13 @@ static bool try_quantize_rgba_delta( static bool try_quantize_rgba_delta_blue_contract( vfloat4 color0, vfloat4 color1, - uint8_t output[8], + vint4& color0_out, + vint4& color1_out, quant_method quant_level ) { // Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract - return try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level) && - try_quantize_alpha_delta(color1, color0, output, quant_level); + return try_quantize_rgb_delta_blue_contract(color0, color1, color0_out, color1_out, quant_level) && + try_quantize_alpha_delta(color1, color0, color0_out, color1_out, quant_level); } /** @@ -774,6 +762,8 @@ static void quantize_rgbs( /** * @brief Quantize an LDR RGBA color using scale encoding. * + * @param color0 The input unquantized color0 alpha endpoint. + * @param color1 The input unquantized color1 alpha endpoint. * @param color The input unquantized color endpoint and scale factor. * @param[out] output The output endpoints, returned as (r0, g0, b0, s, a0, a1). * @param quant_level The quantization level to use. @@ -785,10 +775,8 @@ static void quantize_rgbs_alpha( uint8_t output[6], quant_method quant_level ) { - float scale = 1.0f / 257.0f; - - float a0 = astc::clamp255f(color0.lane<3>() * scale); - float a1 = astc::clamp255f(color1.lane<3>() * scale); + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); output[4] = quant_color(quant_level, astc::flt2int_rtn(a0), a0); output[5] = quant_color(quant_level, astc::flt2int_rtn(a1), a1); @@ -810,13 +798,8 @@ static void quantize_luminance( uint8_t output[2], quant_method quant_level ) { - float scale = 1.0f / 257.0f; - - color0 = color0 * scale; - color1 = color1 * scale; - - float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f)); - float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f)); + float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f); + float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f); if (lum0 > lum1) { @@ -843,16 +826,11 @@ static void quantize_luminance_alpha( uint8_t output[4], quant_method quant_level ) { - float scale = 1.0f / 257.0f; + float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f); + float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f); - color0 = color0 * scale; - color1 = color1 * scale; - - float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f)); - float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f)); - - float a0 = astc::clamp255f(color0.lane<3>()); - float a1 = astc::clamp255f(color1.lane<3>()); + float a0 = color0.lane<3>(); + float a1 = color1.lane<3>(); output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0), lum0); output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1), lum1); @@ -1939,58 +1917,170 @@ uint8_t pack_color_endpoints( ) { assert(QUANT_6 <= quant_level && quant_level <= QUANT_256); - // We do not support negative colors - color0 = max(color0, 0.0f); - color1 = max(color1, 0.0f); + // Clamp colors to a valid LDR range + // Note that HDR has a lower max, handled in the conversion functions + color0 = clamp(0.0f, 65535.0f, color0); + color1 = clamp(0.0f, 65535.0f, color1); + + // Pre-scale the LDR value we need to the 0-255 quantizable range + vfloat4 color0_ldr = color0 * (1.0f / 257.0f); + vfloat4 color1_ldr = color1 * (1.0f / 257.0f); uint8_t retval = 0; + float best_error = ERROR_CALC_DEFAULT; + vint4 color0_out, color1_out; + vint4 color0_out2, color1_out2; switch (format) { case FMT_RGB: if (quant_level <= QUANT_160) { - if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level)) + if (try_quantize_rgb_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level)) { + vint4 color0_unpack; + vint4 color1_unpack; + rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack); + retval = FMT_RGB_DELTA; - break; + best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); } - if (try_quantize_rgb_delta(color0, color1, output, quant_level)) + + if (try_quantize_rgb_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level)) { - retval = FMT_RGB_DELTA; - break; + vint4 color0_unpack; + vint4 color1_unpack; + rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGB_DELTA; + best_error = error; + color0_out = color0_out2; + color1_out = color1_out2; + } } } - if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level)) + + if (quant_level < QUANT_256) { - retval = FMT_RGB; - break; + if (try_quantize_rgb_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level)) + { + vint4 color0_unpack; + vint4 color1_unpack; + rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGB; + best_error = error; + color0_out = color0_out2; + color1_out = color1_out2; + } + } } - quantize_rgb(color0, color1, output, quant_level); - retval = FMT_RGB; + + { + quantize_rgb(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level); + + vint4 color0_unpack; + vint4 color1_unpack; + rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGB; + color0_out = color0_out2; + color1_out = color1_out2; + } + } + + // TODO: Can we vectorize this? + output[0] = static_cast(color0_out.lane<0>()); + output[1] = static_cast(color1_out.lane<0>()); + output[2] = static_cast(color0_out.lane<1>()); + output[3] = static_cast(color1_out.lane<1>()); + output[4] = static_cast(color0_out.lane<2>()); + output[5] = static_cast(color1_out.lane<2>()); break; case FMT_RGBA: if (quant_level <= QUANT_160) { - if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level)) + if (try_quantize_rgba_delta_blue_contract(color0_ldr, color1_ldr, color0_out, color1_out, quant_level)) { + vint4 color0_unpack; + vint4 color1_unpack; + rgba_delta_unpack(color0_out, color1_out, color0_unpack, color1_unpack); + retval = FMT_RGBA_DELTA; - break; + best_error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); } - if (try_quantize_rgba_delta(color0, color1, output, quant_level)) + + if (try_quantize_rgba_delta(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level)) { - retval = FMT_RGBA_DELTA; - break; + vint4 color0_unpack; + vint4 color1_unpack; + rgba_delta_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGBA_DELTA; + best_error = error; + color0_out = color0_out2; + color1_out = color1_out2; + } } } - if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level)) + + if (quant_level < QUANT_256) { - retval = FMT_RGBA; - break; + if (try_quantize_rgba_blue_contract(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level)) + { + vint4 color0_unpack; + vint4 color1_unpack; + rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGBA; + best_error = error; + color0_out = color0_out2; + color1_out = color1_out2; + } + } } - quantize_rgba(color0, color1, output, quant_level); - retval = FMT_RGBA; + + { + quantize_rgba(color0_ldr, color1_ldr, color0_out2, color1_out2, quant_level); + + vint4 color0_unpack; + vint4 color1_unpack; + rgba_unpack(color0_out2, color1_out2, color0_unpack, color1_unpack); + + float error = get_rgba_encoding_error(color0_ldr, color1_ldr, color0_unpack, color1_unpack); + if (error < best_error) + { + retval = FMT_RGBA; + color0_out = color0_out2; + color1_out = color1_out2; + } + } + + // TODO: Can we vectorize this? + output[0] = static_cast(color0_out.lane<0>()); + output[1] = static_cast(color1_out.lane<0>()); + output[2] = static_cast(color0_out.lane<1>()); + output[3] = static_cast(color1_out.lane<1>()); + output[4] = static_cast(color0_out.lane<2>()); + output[5] = static_cast(color1_out.lane<2>()); + output[6] = static_cast(color0_out.lane<3>()); + output[7] = static_cast(color1_out.lane<3>()); break; case FMT_RGB_SCALE: @@ -2009,7 +2099,7 @@ uint8_t pack_color_endpoints( break; case FMT_RGB_SCALE_ALPHA: - quantize_rgbs_alpha(color0, color1, rgbs_color, output, quant_level); + quantize_rgbs_alpha(color0_ldr, color1_ldr, rgbs_color, output, quant_level); retval = FMT_RGB_SCALE_ALPHA; break; @@ -2025,20 +2115,20 @@ uint8_t pack_color_endpoints( break; case FMT_LUMINANCE: - quantize_luminance(color0, color1, output, quant_level); + quantize_luminance(color0_ldr, color1_ldr, output, quant_level); retval = FMT_LUMINANCE; break; case FMT_LUMINANCE_ALPHA: if (quant_level <= 18) { - if (try_quantize_luminance_alpha_delta(color0, color1, output, quant_level)) + if (try_quantize_luminance_alpha_delta(color0_ldr, color1_ldr, output, quant_level)) { retval = FMT_LUMINANCE_ALPHA_DELTA; break; } } - quantize_luminance_alpha(color0, color1, output, quant_level); + quantize_luminance_alpha(color0_ldr, color1_ldr, output, quant_level); retval = FMT_LUMINANCE_ALPHA; break; diff --git a/thirdparty/astcenc/astcenc_color_unquantize.cpp b/thirdparty/astcenc/astcenc_color_unquantize.cpp index d31895a627b..2daa515e731 100644 --- a/thirdparty/astcenc/astcenc_color_unquantize.cpp +++ b/thirdparty/astcenc/astcenc_color_unquantize.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -40,15 +40,7 @@ static ASTCENC_SIMD_INLINE vint4 uncontract_color( return select(input, bc0, mask); } -/** - * @brief Unpack an LDR RGBA color that uses delta encoding. - * - * @param input0 The packed endpoint 0 color. - * @param input1 The packed endpoint 1 color deltas. - * @param[out] output0 The unpacked endpoint 0 color. - * @param[out] output1 The unpacked endpoint 1 color. - */ -static void rgba_delta_unpack( +void rgba_delta_unpack( vint4 input0, vint4 input1, vint4& output0, @@ -92,15 +84,7 @@ static void rgb_delta_unpack( output1.set_lane<3>(255); } -/** - * @brief Unpack an LDR RGBA color that uses direct encoding. - * - * @param input0 The packed endpoint 0 color. - * @param input1 The packed endpoint 1 color. - * @param[out] output0 The unpacked endpoint 0 color. - * @param[out] output1 The unpacked endpoint 1 color. - */ -static void rgba_unpack( +void rgba_unpack( vint4 input0, vint4 input1, vint4& output0, @@ -910,32 +894,55 @@ void unpack_color_endpoints( } } - vint4 ldr_scale(257); - vint4 hdr_scale(1); - vint4 output_scale = ldr_scale; + // Handle endpoint errors and expansion - // An LDR profile image - if ((decode_mode == ASTCENC_PRF_LDR) || - (decode_mode == ASTCENC_PRF_LDR_SRGB)) + // Linear LDR 8-bit endpoints are expanded to 16-bit by replication + if (decode_mode == ASTCENC_PRF_LDR) { - // Also matches HDR alpha, as cannot have HDR alpha without HDR RGB - if (rgb_hdr == true) + // Error color - HDR endpoint in an LDR encoding + if (rgb_hdr || alpha_hdr) { - output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); - output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); - output_scale = hdr_scale; - + output0 = vint4(0xFF, 0x00, 0xFF, 0xFF); + output1 = vint4(0xFF, 0x00, 0xFF, 0xFF); rgb_hdr = false; alpha_hdr = false; } + + output0 = output0 * 257; + output1 = output1 * 257; } - // An HDR profile image + // sRGB LDR 8-bit endpoints are expanded to 16 bit by: + // - RGB = shift left by 8 bits and OR with 0x80 + // - A = replication + else if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + // Error color - HDR endpoint in an LDR encoding + if (rgb_hdr || alpha_hdr) + { + output0 = vint4(0xFF, 0x00, 0xFF, 0xFF); + output1 = vint4(0xFF, 0x00, 0xFF, 0xFF); + rgb_hdr = false; + alpha_hdr = false; + } + + vmask4 mask(true, true, true, false); + + vint4 output0rgb = lsl<8>(output0) | vint4(0x80); + vint4 output0a = output0 * 257; + output0 = select(output0a, output0rgb, mask); + + vint4 output1rgb = lsl<8>(output1) | vint4(0x80); + vint4 output1a = output1 * 257; + output1 = select(output1a, output1rgb, mask); + } + // An HDR profile decode, but may be using linear LDR endpoints + // Linear LDR 8-bit endpoints are expanded to 16-bit by replication + // HDR endpoints are already 16-bit else { vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr); - output_scale = select(ldr_scale, hdr_scale, hdr_lanes); + vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes); + output0 = output0 * output_scale; + output1 = output1 * output_scale; } - - output0 = output0 * output_scale; - output1 = output1 * output_scale; } diff --git a/thirdparty/astcenc/astcenc_compress_symbolic.cpp b/thirdparty/astcenc/astcenc_compress_symbolic.cpp index 41a85582678..98d2495126d 100644 --- a/thirdparty/astcenc/astcenc_compress_symbolic.cpp +++ b/thirdparty/astcenc/astcenc_compress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -247,7 +247,7 @@ static bool realign_weights_decimated( } // Create an unquantized weight grid for this decimation level - alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS]; for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) { vint unquant_value(dec_weights_uquant + we_idx); @@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane( qwt_bitcounts[i] = static_cast(bitcount); - alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; // Generate the optimized set of weights for the weight mode compute_quantized_weights_for_decimation( @@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes( unsigned int decimation_mode = bm.decimation_mode; const auto& di = bsd.get_decimation_info(decimation_mode); - alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; // Generate the optimized set of weights for the mode compute_quantized_weights_for_decimation( @@ -1163,7 +1163,7 @@ static float prepare_block_statistics( void compress_block( const astcenc_contexti& ctx, const image_block& blk, - physical_compressed_block& pcb, + uint8_t pcb[16], compression_working_buffers& tmpbuf) { astcenc_profile decode_mode = ctx.config.profile; @@ -1282,9 +1282,10 @@ void compress_block( static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot; - // Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels + // Only enable MODE0 fast path if enabled + // Never enable for 3D blocks as no "always" block modes are available int start_trial = 1; - if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1)) + if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1)) { start_trial = 0; } diff --git a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp index 39e5525c3b5..7463f7e20b1 100644 --- a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp +++ b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -27,15 +27,15 @@ /** * @brief Compute the integer linear interpolation of two color endpoints. * - * @param decode_mode The ASTC profile (linear or sRGB) + * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16. * @param color0 The endpoint0 color. * @param color1 The endpoint1 color. - * @param weights The interpolation weight (between 0 and 64). + * @param weights The interpolation weight (between 0 and 64). * * @return The interpolated color. */ static vint4 lerp_color_int( - astcenc_profile decode_mode, + vmask4 u8_mask, vint4 color0, vint4 color1, vint4 weights @@ -43,24 +43,18 @@ static vint4 lerp_color_int( vint4 weight1 = weights; vint4 weight0 = vint4(64) - weight1; - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - color0 = asr<8>(color0); - color1 = asr<8>(color1); - } - vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); color = asr<6>(color); - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - color = color * vint4(257); - } + // For decode_unorm8 values force the codec to bit replicate. This allows the + // rest of the codec to assume the full 0xFFFF range for everything and ignore + // the decode_mode setting + vint4 color_u8 = asr<8>(color) * vint4(257); + color = select(color, color_u8, u8_mask); return color; } - /** * @brief Convert integer color value into a float value for the decoder. * @@ -104,10 +98,10 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0(reinterpret_cast(scb.weights + 0)); - vint4 tab1(reinterpret_cast(scb.weights + 16)); - vint4 tab2(reinterpret_cast(scb.weights + 32)); - vint4 tab3(reinterpret_cast(scb.weights + 48)); + vint4 tab0 = vint4::load(scb.weights + 0); + vint4 tab1 = vint4::load(scb.weights + 16); + vint4 tab2 = vint4::load(scb.weights + 32); + vint4 tab3 = vint4::load(scb.weights + 48); vint tab0p, tab1p, tab2p, tab3p; vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); @@ -134,14 +128,14 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1(reinterpret_cast(scb.weights + 0)); - vint4 tab1_plane1(reinterpret_cast(scb.weights + 16)); + vint4 tab0_plane1 = vint4::load(scb.weights + 0); + vint4 tab1_plane1 = vint4::load(scb.weights + 16); vint tab0_plane1p, tab1_plane1p; vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); // Plane 2 - vint4 tab0_plane2(reinterpret_cast(scb.weights + 32)); - vint4 tab1_plane2(reinterpret_cast(scb.weights + 48)); + vint4 tab0_plane2 = vint4::load(scb.weights + 32); + vint4 tab1_plane2 = vint4::load(scb.weights + 48); vint tab0_plane2p, tab1_plane2p; vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); @@ -229,12 +223,13 @@ void decompress_symbolic_block( { vint4 colori(scb.constant_color); - // For sRGB decoding a real decoder would just use the top 8 bits for color conversion. - // We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. - if (decode_mode == ASTCENC_PRF_LDR_SRGB) - { - colori = asr<8>(colori) * 257; - } + // Determine the UNORM8 rounding on the decode + vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); + + // The real decoder would just use the top 8 bits, but we rescale + // in to a 16-bit value that rounds correctly. + vint4 colori_u8 = asr<8>(colori) * 257; + colori = select(colori, colori_u8, u8_mask); vint4 colorf16 = unorm16_to_sf16(colori); color = float16_to_float(colorf16); @@ -289,6 +284,8 @@ void decompress_symbolic_block( int plane2_component = scb.plane2_component; vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); + vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); + for (int i = 0; i < partition_count; i++) { // Decode the color endpoints for this partition @@ -310,7 +307,7 @@ void decompress_symbolic_block( { int tix = pi.texels_of_partition[i][j]; vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); - vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); + vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight); vfloat4 colorf = decode_texel(color, lns_mask); blk.data_r[tix] = colorf.lane<0>(); @@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane( rgb_lns, a_lns, ep0, ep1); + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); + // Unpack and compute error for each texel in the partition unsigned int texel_count = bsd.texel_count; for (unsigned int i = 0; i < texel_count; i++) { vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); - vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); + vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight); vfloat4 color = int_to_float(colori); vfloat4 oldColor = blk.texel(i); @@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane( int plane1_weights[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); + vfloat4 summa = vfloat4::zero(); for (unsigned int i = 0; i < partition_count; i++) { @@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane( for (unsigned int j = 0; j < texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; - vint4 colori = lerp_color_int(config.profile, ep0, ep1, + vint4 colori = lerp_color_int(u8_mask, ep0, ep1, vint4(plane1_weights[tix])); vfloat4 color = int_to_float(colori); @@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition( const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); // Unquantize and undecimate the weights - alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); // Decode the color endpoints for this partition @@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition( rgb_lns, a_lns, ep0, ep1); - - // Pre-shift sRGB so things round correctly - if (config.profile == ASTCENC_PRF_LDR_SRGB) - { - ep0 = asr<8>(ep0); - ep1 = asr<8>(ep1); - } + vmask4 u8_mask = get_u8_component_mask(config.profile, blk); // Unpack and compute error for each texel in the partition vfloatacc summav = vfloatacc::zero(); vint lane_id = vint::lane_id(); - vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1); unsigned int texel_count = bsd.texel_count; for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) @@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition( vint ep0_b = vint(ep0.lane<2>()) * weight0; vint ep0_a = vint(ep0.lane<3>()) * weight0; - // Shift so things round correctly - vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; - vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; - vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; - vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; + // Combine contributions + vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)); + vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)); + vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)); + vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)); + + // If using a U8 decode mode bit replicate top 8 bits + // so rest of codec can assume 0xFFFF max range everywhere + vint colori_r8 = asr<8>(colori_r) * vint(257); + colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>())); + + vint colori_g8 = asr<8>(colori_g) * vint(257); + colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>())); + + vint colori_b8 = asr<8>(colori_b) * vint(257); + colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>())); + + vint colori_a8 = asr<8>(colori_a) * vint(257); + colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>())); // Compute color diff vfloat color_r = int_to_float(colori_r); diff --git a/thirdparty/astcenc/astcenc_entry.cpp b/thirdparty/astcenc/astcenc_entry.cpp index e53762c26a0..71efe9cec48 100644 --- a/thirdparty/astcenc/astcenc_entry.cpp +++ b/thirdparty/astcenc/astcenc_entry.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -55,6 +55,7 @@ struct astcenc_preset_config float tune_2partition_early_out_limit_factor; float tune_3partition_early_out_limit_factor; float tune_2plane_early_out_limit_correlation; + float tune_search_mode0_enable; }; /** @@ -63,22 +64,22 @@ struct astcenc_preset_config static const std::array preset_configs_high {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f }, { ASTCENC_PRE_FAST, - 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f + 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f }, { ASTCENC_PRE_MEDIUM, - 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f } }}; @@ -88,22 +89,22 @@ static const std::array preset_configs_high {{ static const std::array preset_configs_mid {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f }, { ASTCENC_PRE_FAST, - 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f + 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f }, { ASTCENC_PRE_MEDIUM, - 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f + 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f } }}; @@ -113,22 +114,22 @@ static const std::array preset_configs_mid {{ static const std::array preset_configs_low {{ { ASTCENC_PRE_FASTEST, - 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f + 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f }, { ASTCENC_PRE_FAST, - 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f + 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f }, { ASTCENC_PRE_MEDIUM, - 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f + 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f }, { ASTCENC_PRE_THOROUGH, - 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f + 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f }, { ASTCENC_PRE_VERYTHOROUGH, - 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + 4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f }, { ASTCENC_PRE_EXHAUSTIVE, - 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f } }}; @@ -216,11 +217,13 @@ static astcenc_error validate_block_size( /** * @brief Validate flags. * - * @param flags The flags to check. + * @param profile The profile to check. + * @param flags The flags to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_flags( + astcenc_profile profile, unsigned int flags ) { // Flags field must not contain any unknown flag bits @@ -238,6 +241,14 @@ static astcenc_error validate_flags( return ASTCENC_ERR_BAD_FLAGS; } + // Decode_unorm8 must only be used with an LDR profile + bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8; + bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A); + if (is_unorm8 && is_hdr) + { + return ASTCENC_ERR_BAD_DECODE_MODE; + } + return ASTCENC_SUCCESS; } @@ -363,7 +374,7 @@ static astcenc_error validate_config( return status; } - status = validate_flags(config.flags); + status = validate_flags(config.profile, config.flags); if (status != ASTCENC_SUCCESS) { return status; @@ -504,10 +515,10 @@ astcenc_error astcenc_config_init( config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; - config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); - config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); - config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); - config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); + config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit; + config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit; + config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit; + config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit; config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); @@ -516,6 +527,7 @@ astcenc_error astcenc_config_init( config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor; config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor; config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation; + config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable; } // Start and end node are not the same - so interpolate between them else @@ -542,14 +554,10 @@ astcenc_error astcenc_config_init( config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); config.tune_block_mode_limit = LERPI(tune_block_mode_limit); config.tune_refinement_limit = LERPI(tune_refinement_limit); - config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), - TUNE_MAX_TRIAL_CANDIDATES); - config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), - BLOCK_MAX_PARTITIONINGS); - config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit), - BLOCK_MAX_PARTITIONINGS); - config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit), - BLOCK_MAX_PARTITIONINGS); + config.tune_candidate_limit = LERPUI(tune_candidate_limit); + config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit); + config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit); + config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit); config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, LERP(tune_db_limit_b_base) - 19 * ltexels); @@ -558,6 +566,7 @@ astcenc_error astcenc_config_init( config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor); config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor); config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation); + config.tune_search_mode0_enable = LERP(tune_search_mode0_enable); #undef LERP #undef LERPI #undef LERPUI @@ -585,13 +594,14 @@ astcenc_error astcenc_config_init( case ASTCENC_PRF_HDR_RGB_LDR_A: case ASTCENC_PRF_HDR: config.tune_db_limit = 999.0f; + config.tune_search_mode0_enable = 0.0f; break; default: return ASTCENC_ERR_BAD_PROFILE; } // Flags field must not contain any unknown flag bits - status = validate_flags(flags); + status = validate_flags(profile, flags); if (status != ASTCENC_SUCCESS) { return status; @@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc( } ctx->bsd = aligned_malloc(sizeof(block_size_descriptor), ASTCENC_VECALIGN); + if (!ctx->bsd) + { + delete ctxo; + return ASTCENC_ERR_OUT_OF_MEM; + } + bool can_omit_modes = static_cast(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); init_block_size_descriptor(config.block_x, config.block_y, config.block_z, can_omit_modes, @@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc( #if !defined(ASTCENC_DECOMPRESS_ONLY) // Do setup only needed by compression - if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) + if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)) { // Turn a dB limit into a per-texel error for faster use later if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) @@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc( size_t worksize = sizeof(compression_working_buffers) * thread_count; ctx->working_buffers = aligned_malloc(worksize, ASTCENC_VECALIGN); - static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, + static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0), "compression_working_buffers size must be multiple of vector alignment"); if (!ctx->working_buffers) { @@ -802,6 +818,8 @@ static void compress_image( int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; + blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8; + // Populate the block channel weights blk.channel_weight = vfloat4(ctx.config.cw_r_weight, ctx.config.cw_g_weight, @@ -812,7 +830,7 @@ static void compress_image( auto& temp_buffers = ctx.working_buffers[thread_index]; // Only the first thread actually runs the initializer - ctxo.manage_compress.init(block_count); + ctxo.manage_compress.init(block_count, ctx.config.progress_callback); // Determine if we can use an optimized load function bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || @@ -914,8 +932,7 @@ static void compress_image( int offset = ((z * yblocks + y) * xblocks + x) * 16; uint8_t *bp = buffer + offset; - physical_compressed_block* pcb = reinterpret_cast(bp); - compress_block(ctx, blk, *pcb, temp_buffers); + compress_block(ctx, blk, bp, temp_buffers); } ctxo.manage_compress.complete_task_assignment(count); @@ -1138,6 +1155,7 @@ astcenc_error astcenc_decompress_image( unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; + unsigned int block_count = zblocks * yblocks * xblocks; int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; @@ -1152,6 +1170,9 @@ astcenc_error astcenc_decompress_image( image_block blk; blk.texel_count = static_cast(block_x * block_y * block_z); + // Decode mode inferred from the output data type + blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8; + // If context thread count is one then implicitly reset if (ctx->thread_count == 1) { @@ -1159,7 +1180,7 @@ astcenc_error astcenc_decompress_image( } // Only the first thread actually runs the initializer - ctxo->manage_decompress.init(zblocks * yblocks * xblocks); + ctxo->manage_decompress.init(block_count, nullptr); // All threads run this processing loop until there is no work remaining while (true) @@ -1182,10 +1203,9 @@ astcenc_error astcenc_decompress_image( unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; const uint8_t* bp = data + offset; - const physical_compressed_block& pcb = *reinterpret_cast(bp); symbolic_compressed_block scb; - physical_to_symbolic(*ctx->bsd, pcb, scb); + physical_to_symbolic(*ctx->bsd, bp, scb); decompress_symbolic_block(ctx->config.profile, *ctx->bsd, x * block_x, y * block_y, z * block_z, @@ -1224,9 +1244,8 @@ astcenc_error astcenc_get_block_info( astcenc_contexti* ctx = &ctxo->context; // Decode the compressed data into a symbolic form - const physical_compressed_block&pcb = *reinterpret_cast(data); symbolic_compressed_block scb; - physical_to_symbolic(*ctx->bsd, pcb, scb); + physical_to_symbolic(*ctx->bsd, data, scb); // Fetch the appropriate partition and decimation tables block_size_descriptor& bsd = *ctx->bsd; @@ -1359,6 +1378,8 @@ const char* astcenc_get_error_string( return "ASTCENC_ERR_BAD_CONTEXT"; case ASTCENC_ERR_NOT_IMPLEMENTED: return "ASTCENC_ERR_NOT_IMPLEMENTED"; + case ASTCENC_ERR_BAD_DECODE_MODE: + return "ASTCENC_ERR_BAD_DECODE_MODE"; #if defined(ASTCENC_DIAGNOSTICS) case ASTCENC_ERR_DTRACE_FAILURE: return "ASTCENC_ERR_DTRACE_FAILURE"; diff --git a/thirdparty/astcenc/astcenc_find_best_partitioning.cpp b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp index 789c3964ef7..bfbcc35e94e 100644 --- a/thirdparty/astcenc/astcenc_find_best_partitioning.cpp +++ b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp @@ -250,13 +250,16 @@ static void kmeans_update( * * @return The number of bit mismatches. */ -static inline unsigned int partition_mismatch2( +static inline uint8_t partition_mismatch2( const uint64_t a[2], const uint64_t b[2] ) { int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); - return astc::min(v1, v2); + + // Divide by 2 because XOR always counts errors twice, once when missing + // in the expected position, and again when present in the wrong partition + return static_cast(astc::min(v1, v2) / 2); } /** @@ -267,7 +270,7 @@ static inline unsigned int partition_mismatch2( * * @return The number of bit mismatches. */ -static inline unsigned int partition_mismatch3( +static inline uint8_t partition_mismatch3( const uint64_t a[3], const uint64_t b[3] ) { @@ -295,7 +298,9 @@ static inline unsigned int partition_mismatch3( int s5 = p11 + p20; int v2 = astc::min(s4, s5) + p02; - return astc::min(v0, v1, v2); + // Divide by 2 because XOR always counts errors twice, once when missing + // in the expected position, and again when present in the wrong partition + return static_cast(astc::min(v0, v1, v2) / 2); } /** @@ -306,7 +311,7 @@ static inline unsigned int partition_mismatch3( * * @return The number of bit mismatches. */ -static inline unsigned int partition_mismatch4( +static inline uint8_t partition_mismatch4( const uint64_t a[4], const uint64_t b[4] ) { @@ -342,7 +347,9 @@ static inline unsigned int partition_mismatch4( int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); - return astc::min(v0, v1, v2, v3); + // Divide by 2 because XOR always counts errors twice, once when missing + // in the expected position, and again when present in the wrong partition + return static_cast(astc::min(v0, v1, v2, v3) / 2); } using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); @@ -359,7 +366,7 @@ static void count_partition_mismatch_bits( const block_size_descriptor& bsd, unsigned int partition_count, const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], - unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS] + uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS] ) { unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; promise(active_count > 0); @@ -369,6 +376,8 @@ static void count_partition_mismatch_bits( for (unsigned int i = 0; i < active_count; i++) { mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); + assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); + assert(mismatch_counts[i] < bsd.texel_count); } } else if (partition_count == 3) @@ -376,6 +385,8 @@ static void count_partition_mismatch_bits( for (unsigned int i = 0; i < active_count; i++) { mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); + assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); + assert(mismatch_counts[i] < bsd.texel_count); } } else @@ -383,6 +394,8 @@ static void count_partition_mismatch_bits( for (unsigned int i = 0; i < active_count; i++) { mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); + assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); + assert(mismatch_counts[i] < bsd.texel_count); } } } @@ -397,12 +410,13 @@ static void count_partition_mismatch_bits( * @return The number of active partitions in this selection. */ static unsigned int get_partition_ordering_by_mismatch_bits( + unsigned int texel_count, unsigned int partitioning_count, - const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS], - unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] + const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS], + uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] ) { promise(partitioning_count > 0); - unsigned int mscount[256] { 0 }; + uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 }; // Create the histogram of mismatch counts for (unsigned int i = 0; i < partitioning_count; i++) @@ -410,16 +424,14 @@ static unsigned int get_partition_ordering_by_mismatch_bits( mscount[mismatch_count[i]]++; } - unsigned int active_count = partitioning_count - mscount[255]; - // Create a running sum from the histogram array // Cells store previous values only; i.e. exclude self after sum - unsigned int summa = 0; - for (unsigned int i = 0; i < 256; i++) + unsigned int sum = 0; + for (unsigned int i = 0; i < texel_count; i++) { - unsigned int cnt = mscount[i]; - mscount[i] = summa; - summa += cnt; + uint16_t cnt = mscount[i]; + mscount[i] = sum; + sum += cnt; } // Use the running sum as the index, incrementing after read to allow @@ -427,10 +439,10 @@ static unsigned int get_partition_ordering_by_mismatch_bits( for (unsigned int i = 0; i < partitioning_count; i++) { unsigned int idx = mscount[mismatch_count[i]]++; - partition_ordering[idx] = i; + partition_ordering[idx] = static_cast(i); } - return active_count; + return partitioning_count; } /** @@ -447,7 +459,7 @@ static unsigned int compute_kmeans_partition_ordering( const block_size_descriptor& bsd, const image_block& blk, unsigned int partition_count, - unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] + uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] ) { vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; uint8_t texel_partitions[BLOCK_MAX_TEXELS]; @@ -478,11 +490,12 @@ static unsigned int compute_kmeans_partition_ordering( } // Count the mismatch between the block and the format's partition tables - unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]; + uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]; count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); // Sort the partitions based on the number of mismatched bits return get_partition_ordering_by_mismatch_bits( + texels_to_process, bsd.partitioning_count_selected[partition_count - 1], mismatch_counts, partition_ordering); } @@ -565,7 +578,7 @@ unsigned int find_best_partition_candidates( weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; - unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; + uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS]; unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); partition_search_limit = astc::min(partition_search_limit, sequence_len); requested_candidates = astc::min(partition_search_limit, requested_candidates); diff --git a/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp b/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp index 5145e08693b..051782fd0f7 100644 --- a/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp +++ b/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation( } // Otherwise compute an estimate and perform single refinement iteration - alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS]; // Compute an initial average for each decimated weight bool constant_wes = ei.is_constant_weight_error_scale; @@ -1023,7 +1023,7 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0(reinterpret_cast(qat.quant_to_unquant)); + vint4 tab0 = vint4::load(qat.quant_to_unquant); vint tab0p; vtable_prepare(tab0, tab0p); @@ -1056,8 +1056,8 @@ void compute_quantized_weights_for_decimation( } else { - vint4 tab0(reinterpret_cast(qat.quant_to_unquant)); - vint4 tab1(reinterpret_cast(qat.quant_to_unquant + 16)); + vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); + vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); vint tab0p, tab1p; vtable_prepare(tab0, tab1, tab0p, tab1p); @@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane( promise(total_texel_count > 0); promise(partition_count > 0); - alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS]; for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { vint unquant_value(dec_weights_uquant + i); @@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane( storea(unquant_valuef, dec_weight + i); } - alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS]; float* undec_weight_ref; if (di.max_texel_weight_count == 1) { @@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes( promise(total_texel_count > 0); promise(weight_count > 0); - alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; - alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; + ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; + ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); @@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes( storea(unquant_value2f, dec_weight_plane2 + i); } - alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; - alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS]; float* undec_weight_plane1_ref; float* undec_weight_plane2_ref; diff --git a/thirdparty/astcenc/astcenc_image.cpp b/thirdparty/astcenc/astcenc_image.cpp index 9c0d6727d01..079f69f1947 100644 --- a/thirdparty/astcenc/astcenc_image.cpp +++ b/thirdparty/astcenc/astcenc_image.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2022 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -109,7 +109,7 @@ static vfloat4 swz_texel( vfloat4 data, const astcenc_swizzle& swz ) { - alignas(16) float datas[6]; + ASTCENC_ALIGNAS float datas[6]; storea(data, datas); datas[ASTCENC_SWZ_0] = 0.0f; @@ -433,7 +433,7 @@ void store_image_block( vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai); vmask store_mask = vint::lane_id() < vint(used_texels); - store_lanes_masked(reinterpret_cast(data8_row), data_rgbai, store_mask); + store_lanes_masked(data8_row, data_rgbai, store_mask); data8_row += ASTCENC_SIMD_WIDTH * 4; idx += used_texels; diff --git a/thirdparty/astcenc/astcenc_internal.h b/thirdparty/astcenc/astcenc_internal.h index 63bbf8af537..715028ac8f4 100644 --- a/thirdparty/astcenc/astcenc_internal.h +++ b/thirdparty/astcenc/astcenc_internal.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -29,6 +29,7 @@ #include #endif #include +#include #include "astcenc.h" #include "astcenc_mathlib.h" @@ -79,7 +80,7 @@ static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 }; /** @brief The number of partitionings, per partition count, suported by the ASTC format. */ static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 }; -/** @brief The maximum number of weights used during partition selection for texel clustering. */ +/** @brief The maximum number of texels used during partition selection for texel clustering. */ static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 }; /** @brief The maximum number of weights a block can support. */ @@ -119,11 +120,9 @@ static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 }; static constexpr float ERROR_CALC_DEFAULT { 1e30f }; /** - * @brief The minimum texel count for a block to use the one partition fast path. - * - * This setting skips 4x4 and 5x4 block sizes. + * @brief The minimum tuning setting threshold for the one partition fast path. */ -static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 }; +static constexpr float TUNE_MIN_SEARCH_MODE0 { 0.85f }; /** * @brief The maximum number of candidate encodings tested for each encoding mode. @@ -137,7 +136,7 @@ static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 }; * * This can be dynamically reduced by the compression quality preset. */ -static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 32 }; +static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 8 }; /** * @brief The maximum quant level using full angular endpoint search method. @@ -386,7 +385,7 @@ struct decimation_info * @brief The bilinear contribution of the N weights that are interpolated for each texel. * Value is between 0 and 1, stored transposed to improve vectorization. */ - alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; /** @brief The number of texels that each stored weight contributes to. */ uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; @@ -401,7 +400,7 @@ struct decimation_info * @brief The bilinear contribution to the N texels that use each weight. * Value is between 0 and 1, stored transposed to improve vectorization. */ - alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; /** * @brief The bilinear contribution to the Nth texel that uses each weight. @@ -581,7 +580,7 @@ struct block_size_descriptor decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; /** @brief The active decimation tables, stored in low indices. */ - alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; + ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; /** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */ uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES]; @@ -741,16 +740,16 @@ struct block_size_descriptor struct image_block { /** @brief The input (compress) or output (decompress) data for the red color component. */ - alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS]; /** @brief The input (compress) or output (decompress) data for the green color component. */ - alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS]; /** @brief The input (compress) or output (decompress) data for the blue color component. */ - alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS]; /** @brief The input (compress) or output (decompress) data for the alpha color component. */ - alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS]; /** @brief The number of texels in the block. */ uint8_t texel_count; @@ -773,6 +772,9 @@ struct image_block /** @brief Is this grayscale block where R == G == B for all texels? */ bool grayscale; + /** @brief Is the eventual decode using decode_unorm8 rounding? */ + bool decode_unorm8; + /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ uint8_t rgb_lns[BLOCK_MAX_TEXELS]; @@ -899,10 +901,10 @@ struct endpoints_and_weights endpoints ep; /** @brief The ideal weight for each texel; may be undecimated or decimated. */ - alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS]; /** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */ - alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS]; + ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS]; }; /** @@ -932,7 +934,7 @@ struct encoding_choice_errors /** * @brief Preallocated working buffers, allocated per thread during context creation. */ -struct alignas(ASTCENC_VECALIGN) compression_working_buffers +struct ASTCENC_ALIGNAS compression_working_buffers { /** @brief Ideal endpoints and weights for plane 1. */ endpoints_and_weights ei1; @@ -948,7 +950,7 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers * * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets. */ - alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS]; /** * @brief Decimated quantized weight values in the unquantized 0-64 range. @@ -958,7 +960,7 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS]; /** @brief Error of the best encoding combination for each block mode. */ - alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; + ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES]; /** @brief The best color quant for each block mode. */ uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES]; @@ -1025,13 +1027,13 @@ struct dt_init_working_buffers struct quant_and_transfer_table { /** @brief The unscrambled unquantized value. */ - int8_t quant_to_unquant[32]; + uint8_t quant_to_unquant[32]; /** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */ - int8_t scramble_map[32]; + uint8_t scramble_map[32]; /** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */ - int8_t unscramble_and_unquant_map[32]; + uint8_t unscramble_and_unquant_map[32]; /** * @brief A table of previous-and-next weights, indexed by the current unquantized value. @@ -1060,7 +1062,7 @@ static constexpr uint8_t SYM_BTYPE_NONCONST { 3 }; * @brief A symbolic representation of a compressed block. * * The symbolic representation stores the unpacked content of a single - * @c physical_compressed_block, in a form which is much easier to access for + * physical compressed block, in a form which is much easier to access for * the rest of the compressor code. */ struct symbolic_compressed_block @@ -1122,18 +1124,6 @@ struct symbolic_compressed_block } }; -/** - * @brief A physical representation of a compressed block. - * - * The physical representation stores the raw bytes of the format in memory. - */ -struct physical_compressed_block -{ - /** @brief The ASTC encoded data for a single block. */ - uint8_t data[16]; -}; - - /** * @brief Parameter structure for @c compute_pixel_region_variance(). * @@ -1577,6 +1567,33 @@ unsigned int find_best_partition_candidates( Functionality for managing images and image related data. ============================================================================ */ +/** + * @brief Get a vector mask indicating lanes decompressing into a UNORM8 value. + * + * @param decode_mode The color profile for LDR_SRGB settings. + * @param blk The image block for output image bitness settings. + * + * @return The component mask vector. + */ +static inline vmask4 get_u8_component_mask( + astcenc_profile decode_mode, + const image_block& blk +) { + vmask4 u8_mask(false); + // Decode mode writing to a unorm8 output value + if (blk.decode_unorm8) + { + u8_mask = vmask4(true); + } + // SRGB writing to a unorm8 RGB value + else if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + u8_mask = vmask4(true, true, true, false); + } + + return u8_mask; +} + /** * @brief Setup computation of regional averages in an image. * @@ -1830,7 +1847,7 @@ uint8_t pack_color_endpoints( * * Endpoints must be unscrambled and converted into the 0-255 range before calling this functions. * - * @param decode_mode The decode mode (LDR, HDR). + * @param decode_mode The decode mode (LDR, HDR, etc). * @param format The color endpoint mode used. * @param input The raw array of encoded input integers. The length of this array * depends on @c format; it can be safely assumed to be large enough. @@ -1848,6 +1865,34 @@ void unpack_color_endpoints( vint4& output0, vint4& output1); +/** + * @brief Unpack an LDR RGBA color that uses delta encoding. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color deltas. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +void rgba_delta_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1); + +/** + * @brief Unpack an LDR RGBA color that uses direct encoding. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +void rgba_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1); + /** * @brief Unpack a set of quantized and decimated weights. * @@ -2007,7 +2052,7 @@ void compute_angular_endpoints_2planes( void compress_block( const astcenc_contexti& ctx, const image_block& blk, - physical_compressed_block& pcb, + uint8_t pcb[16], compression_working_buffers& tmpbuf); /** @@ -2100,12 +2145,12 @@ float compute_symbolic_block_difference_1plane_1partition( * * @param bsd The block size information. * @param scb The symbolic representation. - * @param[out] pcb The binary encoded data. + * @param[out] pcb The physical compressed block output. */ void symbolic_to_physical( const block_size_descriptor& bsd, const symbolic_compressed_block& scb, - physical_compressed_block& pcb); + uint8_t pcb[16]); /** * @brief Convert a binary physical encoding into a symbolic representation. @@ -2114,12 +2159,12 @@ void symbolic_to_physical( * flagged as an error block if the encoding is invalid. * * @param bsd The block size information. - * @param pcb The binary encoded data. + * @param pcb The physical compresesd block input. * @param[out] scb The output symbolic representation. */ void physical_to_symbolic( const block_size_descriptor& bsd, - const physical_compressed_block& pcb, + const uint8_t pcb[16], symbolic_compressed_block& scb); /* ============================================================================ @@ -2128,10 +2173,11 @@ Platform-specific functions. /** * @brief Allocate an aligned memory buffer. * - * Allocated memory must be freed by aligned_free; + * Allocated memory must be freed by aligned_free. * * @param size The desired buffer size. - * @param align The desired buffer alignment; must be 2^N. + * @param align The desired buffer alignment; must be 2^N, may be increased + * by the implementation to a minimum allowable alignment. * * @return The memory buffer pointer or nullptr on allocation failure. */ @@ -2141,10 +2187,14 @@ T* aligned_malloc(size_t size, size_t align) void* ptr; int error = 0; + // Don't allow this to under-align a type + size_t min_align = astc::max(alignof(T), sizeof(void*)); + size_t real_align = astc::max(min_align, align); + #if defined(_WIN32) - ptr = _aligned_malloc(size, align); + ptr = _aligned_malloc(size, real_align); #else - error = posix_memalign(&ptr, align, size); + error = posix_memalign(&ptr, real_align, size); #endif if (error || (!ptr)) @@ -2164,9 +2214,9 @@ template void aligned_free(T* ptr) { #if defined(_WIN32) - _aligned_free(reinterpret_cast(ptr)); + _aligned_free(ptr); #else - free(reinterpret_cast(ptr)); + free(ptr); #endif } diff --git a/thirdparty/astcenc/astcenc_internal_entry.h b/thirdparty/astcenc/astcenc_internal_entry.h index 4e8794547ab..26677147810 100644 --- a/thirdparty/astcenc/astcenc_internal_entry.h +++ b/thirdparty/astcenc/astcenc_internal_entry.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2022 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -118,6 +118,18 @@ private: /** @brief Number of tasks that need to be processed. */ unsigned int m_task_count; + /** @brief Progress callback (optional). */ + astcenc_progress_callback m_callback; + + /** @brief Lock used for callback synchronization. */ + std::mutex m_callback_lock; + + /** @brief Minimum progress before making a callback. */ + float m_callback_min_diff; + + /** @brief Last progress callback value. */ + float m_callback_last_value; + public: /** @brief Create a new ParallelManager. */ ParallelManager() @@ -138,6 +150,8 @@ public: m_start_count = 0; m_done_count = 0; m_task_count = 0; + m_callback_last_value = 0.0f; + m_callback_min_diff = 1.0f; } /** @@ -166,14 +180,20 @@ public: * initialization. Other threads will block and wait for it to complete. * * @param task_count Total number of tasks needing processing. + * @param callback Function pointer for progress status callbacks. */ - void init(unsigned int task_count) + void init(unsigned int task_count, astcenc_progress_callback callback) { std::lock_guard lck(m_lock); if (!m_init_done) { + m_callback = callback; m_task_count = task_count; m_init_done = true; + + // Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead + float min_diff = (4096.0f / static_cast(task_count)) * 100.0f; + m_callback_min_diff = astc::max(min_diff, 1.0f); } } @@ -212,12 +232,49 @@ public: { // Note: m_done_count cannot use an atomic without the mutex; this has a race between the // update here and the wait() for other threads - std::unique_lock lck(m_lock); - this->m_done_count += count; - if (m_done_count == m_task_count) + unsigned int local_count; + float local_last_value; { - lck.unlock(); - m_complete.notify_all(); + std::unique_lock lck(m_lock); + m_done_count += count; + local_count = m_done_count; + local_last_value = m_callback_last_value; + + if (m_done_count == m_task_count) + { + // Ensure the progress bar hits 100% + if (m_callback) + { + std::unique_lock cblck(m_callback_lock); + m_callback(100.0f); + m_callback_last_value = 100.0f; + } + + lck.unlock(); + m_complete.notify_all(); + } + } + + // Process progress callback if we have one + if (m_callback) + { + // Initial lockless test - have we progressed enough to emit? + float num = static_cast(local_count); + float den = static_cast(m_task_count); + float this_value = (num / den) * 100.0f; + bool report_test = (this_value - local_last_value) > m_callback_min_diff; + + // Recheck under lock, because another thread might report first + if (report_test) + { + std::unique_lock cblck(m_callback_lock); + bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff; + if (report_retest) + { + m_callback(this_value); + m_callback_last_value = this_value; + } + } } } diff --git a/thirdparty/astcenc/astcenc_mathlib.h b/thirdparty/astcenc/astcenc_mathlib.h index 0540c4fedd3..562d6597f26 100644 --- a/thirdparty/astcenc/astcenc_mathlib.h +++ b/thirdparty/astcenc/astcenc_mathlib.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -73,10 +73,22 @@ #endif #endif +// Force vector-sized SIMD alignment #if ASTCENC_AVX #define ASTCENC_VECALIGN 32 -#else +#elif ASTCENC_SSE || ASTCENC_NEON #define ASTCENC_VECALIGN 16 +// Use default alignment for non-SIMD builds +#else + #define ASTCENC_VECALIGN 0 +#endif + +// C++11 states that alignas(0) should be ignored but GCC doesn't do +// this on some versions, so workaround and avoid emitting alignas(0) +#if ASTCENC_VECALIGN > 0 + #define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN) +#else + #define ASTCENC_ALIGNAS #endif #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 diff --git a/thirdparty/astcenc/astcenc_symbolic_physical.cpp b/thirdparty/astcenc/astcenc_symbolic_physical.cpp index 49a8a1504b0..c4da678f1c3 100644 --- a/thirdparty/astcenc/astcenc_symbolic_physical.cpp +++ b/thirdparty/astcenc/astcenc_symbolic_physical.cpp @@ -102,7 +102,7 @@ static inline void write_bits( void symbolic_to_physical( const block_size_descriptor& bsd, const symbolic_compressed_block& scb, - physical_compressed_block& pcb + uint8_t pcb[16] ) { assert(scb.block_type != SYM_BTYPE_ERROR); @@ -113,13 +113,13 @@ void symbolic_to_physical( static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; for (unsigned int i = 0; i < 8; i++) { - pcb.data[i] = cbytes[i]; + pcb[i] = cbytes[i]; } for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) { - pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; - pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; + pcb[2 * i + 8] = scb.constant_color[i] & 0xFF; + pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; } return; @@ -132,13 +132,13 @@ void symbolic_to_physical( static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; for (unsigned int i = 0; i < 8; i++) { - pcb.data[i] = cbytes[i]; + pcb[i] = cbytes[i]; } for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) { - pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; - pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; + pcb[2 * i + 8] = scb.constant_color[i] & 0xFF; + pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; } return; @@ -194,23 +194,23 @@ void symbolic_to_physical( for (int i = 0; i < 16; i++) { - pcb.data[i] = static_cast(bitrev8(weightbuf[15 - i])); + pcb[i] = static_cast(bitrev8(weightbuf[15 - i])); } - write_bits(scb.block_mode, 11, 0, pcb.data); - write_bits(partition_count - 1, 2, 11, pcb.data); + write_bits(scb.block_mode, 11, 0, pcb); + write_bits(partition_count - 1, 2, 11, pcb); int below_weights_pos = 128 - bits_for_weights; // Encode partition index and color endpoint types for blocks with 2+ partitions if (partition_count > 1) { - write_bits(scb.partition_index, 6, 13, pcb.data); - write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data); + write_bits(scb.partition_index, 6, 13, pcb); + write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb); if (scb.color_formats_matched) { - write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data); + write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb); } else { @@ -249,20 +249,20 @@ void symbolic_to_physical( int encoded_type_highpart = encoded_type >> 6; int encoded_type_highpart_size = (3 * partition_count) - 4; int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size; - write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data); - write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data); + write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb); + write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb); below_weights_pos -= encoded_type_highpart_size; } } else { - write_bits(scb.color_formats[0], 4, 13, pcb.data); + write_bits(scb.color_formats[0], 4, 13, pcb); } // In dual-plane mode, encode the color component of the second plane of weights if (is_dual_plane) { - write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data); + write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb); } // Encode the color components @@ -281,7 +281,7 @@ void symbolic_to_physical( valuecount_to_encode += vals; } - encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data, + encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb, scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS); } @@ -290,7 +290,7 @@ void symbolic_to_physical( /* See header for documentation. */ void physical_to_symbolic( const block_size_descriptor& bsd, - const physical_compressed_block& pcb, + const uint8_t pcb[16], symbolic_compressed_block& scb ) { uint8_t bswapped[16]; @@ -298,7 +298,7 @@ void physical_to_symbolic( scb.block_type = SYM_BTYPE_NONCONST; // Extract header fields - int block_mode = read_bits(11, 0, pcb.data); + int block_mode = read_bits(11, 0, pcb); if ((block_mode & 0x1FF) == 0x1FC) { // Constant color block @@ -316,24 +316,24 @@ void physical_to_symbolic( scb.partition_count = 0; for (int i = 0; i < 4; i++) { - scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8); + scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8); } // Additionally, check that the void-extent if (bsd.zdim == 1) { // 2D void-extent - int rsvbits = read_bits(2, 10, pcb.data); + int rsvbits = read_bits(2, 10, pcb); if (rsvbits != 3) { scb.block_type = SYM_BTYPE_ERROR; return; } - int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8); - int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8); - int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8); - int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8); + int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8); + int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8); + int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8); + int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8); int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF; @@ -346,12 +346,12 @@ void physical_to_symbolic( else { // 3D void-extent - int vx_low_s = read_bits(9, 10, pcb.data); - int vx_high_s = read_bits(9, 19, pcb.data); - int vx_low_t = read_bits(9, 28, pcb.data); - int vx_high_t = read_bits(9, 37, pcb.data); - int vx_low_p = read_bits(9, 46, pcb.data); - int vx_high_p = read_bits(9, 55, pcb.data); + int vx_low_s = read_bits(9, 10, pcb); + int vx_high_s = read_bits(9, 19, pcb); + int vx_low_t = read_bits(9, 28, pcb); + int vx_high_t = read_bits(9, 37, pcb); + int vx_low_p = read_bits(9, 46, pcb); + int vx_high_p = read_bits(9, 55, pcb); int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF; @@ -383,7 +383,7 @@ void physical_to_symbolic( int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count; - int partition_count = read_bits(2, 11, pcb.data) + 1; + int partition_count = read_bits(2, 11, pcb) + 1; promise(partition_count > 0); scb.block_mode = static_cast(block_mode); @@ -391,7 +391,7 @@ void physical_to_symbolic( for (int i = 0; i < 16; i++) { - bswapped[i] = static_cast(bitrev8(pcb.data[15 - i])); + bswapped[i] = static_cast(bitrev8(pcb[15 - i])); } int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method); @@ -432,14 +432,15 @@ void physical_to_symbolic( int encoded_type_highpart_size = 0; if (partition_count == 1) { - color_formats[0] = read_bits(4, 13, pcb.data); + color_formats[0] = read_bits(4, 13, pcb); scb.partition_index = 0; } else { encoded_type_highpart_size = (3 * partition_count) - 4; below_weights_pos -= encoded_type_highpart_size; - int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6); + int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) | + (read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6); int baseclass = encoded_type & 0x3; if (baseclass == 0) { @@ -469,7 +470,8 @@ void physical_to_symbolic( bitpos += 2; } } - scb.partition_index = static_cast(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6)); + scb.partition_index = static_cast(read_bits(6, 13, pcb) | + (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6)); } for (int i = 0; i < partition_count; i++) @@ -515,7 +517,7 @@ void physical_to_symbolic( scb.quant_mode = static_cast(color_quant_level); uint8_t values_to_decode[32]; - decode_ise(static_cast(color_quant_level), color_integer_count, pcb.data, + decode_ise(static_cast(color_quant_level), color_integer_count, pcb, values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS)); int valuecount_to_decode = 0; @@ -534,6 +536,6 @@ void physical_to_symbolic( scb.plane2_component = -1; if (is_dual_plane) { - scb.plane2_component = static_cast(read_bits(2, below_weights_pos - 2, pcb.data)); + scb.plane2_component = static_cast(read_bits(2, below_weights_pos - 2, pcb)); } } diff --git a/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h b/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h index a785aca75b2..3ca25e35e13 100644 --- a/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h +++ b/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -241,6 +241,14 @@ struct vint8 return vint8(_mm256_broadcastd_epi32(a)); } + /** + * @brief Factory that returns a vector loaded from unaligned memory. + */ + static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p) + { + return vint8(_mm256_lddqu_si256(reinterpret_cast(p))); + } + /** * @brief Factory that returns a vector loaded from 32B aligned memory. */ @@ -1000,7 +1008,7 @@ ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a) */ ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) { - a = round(a); + a = a + vfloat8(0.5f); return vint8(_mm256_cvttps_epi32(a.m)); } @@ -1152,9 +1160,9 @@ ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a) * * All masked lanes must be at the end of vector, after all non-masked lanes. */ -ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask) +ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask) { - _mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m); + _mm256_maskstore_epi32(reinterpret_cast(base), _mm256_castps_si256(mask.m), data.m); } /** @@ -1162,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask) */ ASTCENC_SIMD_INLINE void print(vint8 a) { - alignas(ASTCENC_VECALIGN) int v[8]; + alignas(32) int v[8]; storea(a, v); printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); @@ -1173,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a) */ ASTCENC_SIMD_INLINE void printx(vint8 a) { - alignas(ASTCENC_VECALIGN) int v[8]; + alignas(32) int v[8]; storea(a, v); printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); @@ -1184,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a) */ ASTCENC_SIMD_INLINE void print(vfloat8 a) { - alignas(ASTCENC_VECALIGN) float v[8]; + alignas(32) float v[8]; storea(a, v); printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", static_cast(v[0]), static_cast(v[1]), diff --git a/thirdparty/astcenc/astcenc_vecmathlib_common_4.h b/thirdparty/astcenc/astcenc_vecmathlib_common_4.h index 86ee4fd3e1f..1e04367c1ff 100644 --- a/thirdparty/astcenc/astcenc_vecmathlib_common_4.h +++ b/thirdparty/astcenc/astcenc_vecmathlib_common_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2021 Arm Limited +// Copyright 2020-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed( */ ASTCENC_SIMD_INLINE void print(vint4 a) { - alignas(16) int v[4]; + ASTCENC_ALIGNAS int v[4]; storea(a, v); printf("v4_i32:\n %8d %8d %8d %8d\n", v[0], v[1], v[2], v[3]); @@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a) */ ASTCENC_SIMD_INLINE void printx(vint4 a) { - alignas(16) int v[4]; + ASTCENC_ALIGNAS int v[4]; storea(a, v); printf("v4_i32:\n %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3]); @@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a) */ ASTCENC_SIMD_INLINE void print(vfloat4 a) { - alignas(16) float v[4]; + ASTCENC_ALIGNAS float v[4]; storea(a, v); printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", static_cast(v[0]), static_cast(v[1]), diff --git a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h index e742eae6cbc..42545e75627 100644 --- a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h +++ b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -38,6 +38,7 @@ #endif #include +#include // ============================================================================ // vfloat4 data type @@ -269,6 +270,16 @@ struct vint4 return vint4(*p); } + /** + * @brief Factory that returns a vector loaded from unaligned memory. + */ + static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p) + { + vint4 data; + std::memcpy(&data.m, p, 4 * sizeof(int)); + return data; + } + /** * @brief Factory that returns a vector loaded from 16B aligned memory. */ @@ -348,9 +359,9 @@ struct vmask4 /** * @brief Get the scalar from a single lane. */ - template ASTCENC_SIMD_INLINE uint32_t lane() const + template ASTCENC_SIMD_INLINE bool lane() const { - return vgetq_lane_u32(m, l); + return vgetq_lane_u32(m, l) != 0; } /** @@ -584,6 +595,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p) vst1q_s32(p, a.m); } +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p) +{ + std::memcpy(p, &a.m, sizeof(int) * 4); +} + /** * @brief Store lowest N (vector width) bytes into an unaligned address. */ @@ -849,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) */ ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) { - a = round(a); + a = a + vfloat4(0.5f); return vint4(vcvtq_s32_f32(a.m)); } @@ -1027,31 +1046,39 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a) return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); } +/** + * @brief Store a single vector lane to an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data) +{ + std::memcpy(base, &data, sizeof(int)); +} + /** * @brief Store a vector, skipping masked lanes. * * All masked lanes must be at the end of vector, after all non-masked lanes. */ -ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) +ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask) { if (mask.lane<3>()) { store(data, base); } - else if (mask.lane<2>()) + else if (mask.lane<2>() != 0.0f) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); - base[2] = data.lane<2>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); + store_lane(base + 8, data.lane<2>()); } - else if (mask.lane<1>()) + else if (mask.lane<1>() != 0.0f) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); } - else if (mask.lane<0>()) + else if (mask.lane<0>() != 0.0f) { - base[0] = data.lane<0>(); + store_lane(base + 0, data.lane<0>()); } } diff --git a/thirdparty/astcenc/astcenc_vecmathlib_none_4.h b/thirdparty/astcenc/astcenc_vecmathlib_none_4.h index d9b52be3e42..1c95c2ff88a 100644 --- a/thirdparty/astcenc/astcenc_vecmathlib_none_4.h +++ b/thirdparty/astcenc/astcenc_vecmathlib_none_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -275,6 +275,16 @@ struct vint4 return vint4(*p); } + /** + * @brief Factory that returns a vector loaded from unaligned memory. + */ + static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p) + { + vint4 data; + std::memcpy(&data.m, p, 4 * sizeof(int)); + return data; + } + /** * @brief Factory that returns a vector loaded from 16B aligned memory. */ @@ -341,6 +351,13 @@ struct vmask4 m[3] = d == false ? 0 : -1; } + /** + * @brief Get the scalar value of a single lane. + */ + template ASTCENC_SIMD_INLINE float lane() const + { + return m[l] != 0; + } /** * @brief The vector ... @@ -644,13 +661,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p) p[3] = a.m[3]; } +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p) +{ + std::memcpy(p, a.m, sizeof(int) * 4); +} + /** * @brief Store lowest N (vector width) bytes into an unaligned address. */ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) { - int* pi = reinterpret_cast(p); - *pi = a.m[0]; + std::memcpy(p, a.m, sizeof(uint8_t) * 4); } /** @@ -963,10 +987,11 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) */ ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) { - return vint4(static_cast(a.m[0] + 0.5f), - static_cast(a.m[1] + 0.5f), - static_cast(a.m[2] + 0.5f), - static_cast(a.m[3] + 0.5f)); + a = a + vfloat4(0.5f); + return vint4(static_cast(a.m[0]), + static_cast(a.m[1]), + static_cast(a.m[2]), + static_cast(a.m[3])); } /** @@ -1030,7 +1055,7 @@ ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a) ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a) { vint4 r; - memcpy(r.m, a.m, 4 * 4); + std::memcpy(r.m, a.m, 4 * 4); return r; } @@ -1044,7 +1069,7 @@ ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a) ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) { vfloat4 r; - memcpy(r.m, a.m, 4 * 4); + std::memcpy(r.m, a.m, 4 * 4); return r; } @@ -1079,12 +1104,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare( } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. */ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) { uint8_t table[16]; - storea(t0, reinterpret_cast(table + 0)); + + std::memcpy(table + 0, t0.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1099,8 +1125,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) { uint8_t table[32]; - storea(t0, reinterpret_cast(table + 0)); - storea(t1, reinterpret_cast(table + 16)); + + std::memcpy(table + 0, t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, t1.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1114,10 +1141,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) { uint8_t table[64]; - storea(t0, reinterpret_cast(table + 0)); - storea(t1, reinterpret_cast(table + 16)); - storea(t2, reinterpret_cast(table + 32)); - storea(t3, reinterpret_cast(table + 48)); + + std::memcpy(table + 0, t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, t1.m, 4 * sizeof(int)); + std::memcpy(table + 32, t2.m, 4 * sizeof(int)); + std::memcpy(table + 48, t3.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1138,12 +1166,21 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a) return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); } +/** + * @brief Store a single vector lane to an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data) +{ + std::memcpy(base, &data, sizeof(int)); +} + /** * @brief Store a vector, skipping masked lanes. * * All masked lanes must be at the end of vector, after all non-masked lanes. + * Input is a byte array of at least 4 bytes per unmasked entry. */ -ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) +ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask) { if (mask.m[3]) { @@ -1151,18 +1188,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) } else if (mask.m[2]) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); - base[2] = data.lane<2>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); + store_lane(base + 8, data.lane<2>()); } else if (mask.m[1]) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); } else if (mask.m[0]) { - base[0] = data.lane<0>(); + store_lane(base + 0, data.lane<0>()); } } diff --git a/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h b/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h index 26dcc4a891b..b69655f9041 100644 --- a/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h +++ b/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -39,6 +39,7 @@ #endif #include +#include // ============================================================================ // vfloat4 data type @@ -292,6 +293,18 @@ struct vint4 return vint4(*p); } + /** + * @brief Factory that returns a vector loaded from unaligned memory. + */ + static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p) + { +#if ASTCENC_SSE >= 41 + return vint4(_mm_lddqu_si128(reinterpret_cast(p))); +#else + return vint4(_mm_loadu_si128(reinterpret_cast(p))); +#endif + } + /** * @brief Factory that returns a vector loaded from 16B aligned memory. */ @@ -366,9 +379,9 @@ struct vmask4 /** * @brief Get the scalar value of a single lane. */ - template ASTCENC_SIMD_INLINE float lane() const + template ASTCENC_SIMD_INLINE bool lane() const { - return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); + return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f; } /** @@ -633,6 +646,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p) _mm_storeu_ps(reinterpret_cast(p), _mm_castsi128_ps(a.m)); } +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p) +{ + std::memcpy(p, &a.m, sizeof(int) * 4); +} + /** * @brief Store lowest N (vector width) bytes into an unaligned address. */ @@ -934,7 +955,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) */ ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) { - a = round(a); + a = a + vfloat4(0.5f); return vint4(_mm_cvttps_epi32(a.m)); } @@ -1087,8 +1108,9 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) __m128i result = _mm_shuffle_epi8(t0.m, idxx); return vint4(result); #else - alignas(ASTCENC_VECALIGN) uint8_t table[16]; - storea(t0, reinterpret_cast(table + 0)); + uint8_t table[16]; + + std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1114,9 +1136,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) return vint4(result); #else - alignas(ASTCENC_VECALIGN) uint8_t table[32]; - storea(t0, reinterpret_cast(table + 0)); - storea(t1, reinterpret_cast(table + 16)); + uint8_t table[32]; + + std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1150,11 +1173,12 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3 return vint4(result); #else - alignas(ASTCENC_VECALIGN) uint8_t table[64]; - storea(t0, reinterpret_cast(table + 0)); - storea(t1, reinterpret_cast(table + 16)); - storea(t2, reinterpret_cast(table + 32)); - storea(t3, reinterpret_cast(table + 48)); + uint8_t table[64]; + + std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); + std::memcpy(table + 32, &t2.m, 4 * sizeof(int)); + std::memcpy(table + 48, &t3.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1190,15 +1214,23 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a) #endif } +/** + * @brief Store a single vector lane to an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data) +{ + std::memcpy(base, &data, sizeof(int)); +} + /** * @brief Store a vector, skipping masked lanes. * * All masked lanes must be at the end of vector, after all non-masked lanes. */ -ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) +ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask) { #if ASTCENC_AVX >= 2 - _mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m); + _mm_maskstore_epi32(reinterpret_cast(base), _mm_castps_si128(mask.m), data.m); #else // Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee // fault suppression on masked lanes so we can get page faults at the end of an image. @@ -1208,18 +1240,18 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) } else if (mask.lane<2>() != 0.0f) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); - base[2] = data.lane<2>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); + store_lane(base + 8, data.lane<2>()); } else if (mask.lane<1>() != 0.0f) { - base[0] = data.lane<0>(); - base[1] = data.lane<1>(); + store_lane(base + 0, data.lane<0>()); + store_lane(base + 4, data.lane<1>()); } else if (mask.lane<0>() != 0.0f) { - base[0] = data.lane<0>(); + store_lane(base + 0, data.lane<0>()); } #endif } diff --git a/thirdparty/astcenc/astcenc_weight_align.cpp b/thirdparty/astcenc/astcenc_weight_align.cpp index aa6ab61fa19..4e993e7397a 100644 --- a/thirdparty/astcenc/astcenc_weight_align.cpp +++ b/thirdparty/astcenc/astcenc_weight_align.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32 }; -alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; -alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; +ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; +ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; #if defined(ASTCENC_DIAGNOSTICS) static bool print_once { true }; @@ -99,7 +99,7 @@ static void compute_angular_offsets( promise(weight_count > 0); promise(max_angular_steps > 0); - alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS]; + ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS]; // Precompute isample; arrays are always allocated 64 elements long for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) @@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels( unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; - alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS]; compute_angular_offsets(weight_count, dec_weight_ideal_value, max_angular_steps, angular_offsets); - alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; - alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS]; + ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS]; + ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS]; compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps, diff --git a/thirdparty/astcenc/patches/fix-build-no-ssse3.patch b/thirdparty/astcenc/patches/fix-build-no-ssse3.patch deleted file mode 100644 index 9da4f3e1f3a..00000000000 --- a/thirdparty/astcenc/patches/fix-build-no-ssse3.patch +++ /dev/null @@ -1,81 +0,0 @@ -From 02c22d3df501dc284ba732fa82a6c408c57b3237 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= -Date: Thu, 19 Jan 2023 23:30:13 +0100 -Subject: [PATCH] mathlib: Remove incomplete support for SSE3 which assumed - SSSE3 - -`_mm_shuffle_epi8` requires SSSE3 so the check on `ASTCENC_SSE >= 30` is -too lax and would fail if `__SSE3__` is supported, but not `__SSSE3__`. - -The only supported configurations are SSE2, SSE4.1, and AVX2, so as -discussed in #393 we drop the SSE3 checks and require SSE4.1 instead. ---- - Source/astcenc_mathlib.h | 2 -- - Source/astcenc_vecmathlib_sse_4.h | 10 +++++----- - 2 files changed, 5 insertions(+), 7 deletions(-) - -diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h -index 67e989e..0540c4f 100644 ---- a/Source/astcenc_mathlib.h -+++ b/Source/astcenc_mathlib.h -@@ -48,8 +48,6 @@ - #define ASTCENC_SSE 42 - #elif defined(__SSE4_1__) - #define ASTCENC_SSE 41 -- #elif defined(__SSE3__) -- #define ASTCENC_SSE 30 - #elif defined(__SSE2__) - #define ASTCENC_SSE 20 - #else -diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h -index 76fe577..26dcc4a 100644 ---- a/Source/astcenc_vecmathlib_sse_4.h -+++ b/Source/astcenc_vecmathlib_sse_4.h -@@ -1046,7 +1046,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) - */ - ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) - { --#if ASTCENC_SSE >= 30 -+#if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; - #else -@@ -1062,7 +1062,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) - { --#if ASTCENC_SSE >= 30 -+#if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; - t2p = t1 ^ t2; -@@ -1080,7 +1080,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( - */ - ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) - { --#if ASTCENC_SSE >= 30 -+#if ASTCENC_SSE >= 41 - // Set index byte MSB to 1 for unused bytes so shuffle returns zero - __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - -@@ -1102,7 +1102,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) - */ - ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) - { --#if ASTCENC_SSE >= 30 -+#if ASTCENC_SSE >= 41 - // Set index byte MSB to 1 for unused bytes so shuffle returns zero - __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - -@@ -1130,7 +1130,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) - */ - ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) - { --#if ASTCENC_SSE >= 30 -+#if ASTCENC_SSE >= 41 - // Set index byte MSB to 1 for unused bytes so shuffle returns zero - __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - --- -2.39.1 -