diff --git a/modules/etcpak/image_compress_etcpak.cpp b/modules/etcpak/image_compress_etcpak.cpp
index e9ce92dd801..9c04f5b40eb 100644
--- a/modules/etcpak/image_compress_etcpak.cpp
+++ b/modules/etcpak/image_compress_etcpak.cpp
@@ -44,9 +44,9 @@ EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
 		case Image::USED_CHANNELS_LA:
 			return EtcpakType::ETCPAK_TYPE_ETC2_ALPHA;
 		case Image::USED_CHANNELS_R:
-			return EtcpakType::ETCPAK_TYPE_ETC2;
+			return EtcpakType::ETCPAK_TYPE_ETC2_R;
 		case Image::USED_CHANNELS_RG:
-			return EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG;
+			return EtcpakType::ETCPAK_TYPE_ETC2_RG;
 		case Image::USED_CHANNELS_RGB:
 			return EtcpakType::ETCPAK_TYPE_ETC2;
 		case Image::USED_CHANNELS_RGBA:
@@ -114,6 +114,12 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
 	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2) {
 		target_format = Image::FORMAT_ETC2_RGB8;
 		r_img->convert_rgba8_to_bgra8(); // It's badly documented but ETCPAK seems to be expected BGRA8 for ETC.
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_R) {
+		target_format = Image::FORMAT_ETC2_R11;
+		r_img->convert_rgba8_to_bgra8(); // It's badly documented, but etcpak seems to expect BGRA8 for ETC.
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_RG) {
+		target_format = Image::FORMAT_ETC2_RG11;
+		r_img->convert_rgba8_to_bgra8(); // It's badly documented, but etcpak seems to expect BGRA8 for ETC.
 	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG) {
 		target_format = Image::FORMAT_ETC2_RA_AS_RG;
 		r_img->convert_rg_to_ra_rgba8();
@@ -224,22 +230,49 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
 			// Override the src_mip_read pointer to our temporary Vector.
 			src_mip_read = padded_src.ptr();
 		}
-		if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC1) {
-			CompressEtc1RgbDither(src_mip_read, dest_mip_write, blocks, mip_w);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2) {
-			CompressEtc2Rgb(src_mip_read, dest_mip_write, blocks, mip_w, true);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_ALPHA || p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG) {
-			CompressEtc2Rgba(src_mip_read, dest_mip_write, blocks, mip_w, true);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT1) {
-			CompressDxt1Dither(src_mip_read, dest_mip_write, blocks, mip_w);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
-			CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_RGTC_RG) {
-			CompressRgtcRG(src_mip_read, dest_mip_write, blocks, mip_w);
-		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_RGTC_R) {
-			CompressRgtcR(src_mip_read, dest_mip_write, blocks, mip_w);
-		} else {
-			ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format.");
-		}
+
+		switch (p_compresstype) {
+			case EtcpakType::ETCPAK_TYPE_ETC1:
+				CompressEtc1RgbDither(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_ETC2:
+				CompressEtc2Rgb(src_mip_read, dest_mip_write, blocks, mip_w, true);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_ETC2_ALPHA:
+			case EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG:
+				CompressEtc2Rgba(src_mip_read, dest_mip_write, blocks, mip_w, true);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_ETC2_R:
+				CompressEtc2R8(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_ETC2_RG:
+				CompressEtc2RG8(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_DXT1:
+				CompressDxt1Dither(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_DXT5:
+			case EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG:
+				CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_RGTC_R:
+				CompressRgtcR(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			case EtcpakType::ETCPAK_TYPE_RGTC_RG:
+				CompressRgtcRG(src_mip_read, dest_mip_write, blocks, mip_w);
+				break;
+
+			default:
+				ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format.");
+				break;
+		}
 	}
 }
diff --git a/modules/etcpak/image_compress_etcpak.h b/modules/etcpak/image_compress_etcpak.h
index ff8bb635b40..9d5343740bf 100644
--- a/modules/etcpak/image_compress_etcpak.h
+++ b/modules/etcpak/image_compress_etcpak.h
@@ -38,6 +38,8 @@ enum class EtcpakType {
 	ETCPAK_TYPE_ETC2,
 	ETCPAK_TYPE_ETC2_ALPHA,
 	ETCPAK_TYPE_ETC2_RA_AS_RG,
+	ETCPAK_TYPE_ETC2_R,
+	ETCPAK_TYPE_ETC2_RG,
 	ETCPAK_TYPE_DXT1,
 	ETCPAK_TYPE_DXT5,
 	ETCPAK_TYPE_DXT5_RA_AS_RG,
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 533f4592e12..c3f0e25b9ab 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -225,6 +225,9 @@ Files extracted from upstream source:
   ```
 - `AUTHORS.txt` and `LICENSE.txt`
 
+Two files (`ProcessRGB.{cpp,hpp}`) have been modified to provide ETC2_R and ETC2_RG compression;
+the changes are based on the existing code.
+
 Two files (`ProcessRgtc.{cpp,hpp}`) have been added to provide RGTC compression implementation,
 based on library's `ProcessDxtc.{cpp,hpp}`.
 
diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
index 4dc3bf23af2..0caa687bc63 100644
--- a/thirdparty/etcpak/ProcessRGB.cpp
+++ b/thirdparty/etcpak/ProcessRGB.cpp
@@ -4181,3 +4181,145 @@ void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size
     }
     while( --blocks );
 }
+
+// -- GODOT start --
+void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    int w = 0;
+    uint8_t r[4*4];
+    do
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
+
+        __m128i c0 = _mm_castps_si128( px0 );
+        __m128i c1 = _mm_castps_si128( px1 );
+        __m128i c2 = _mm_castps_si128( px2 );
+        __m128i c3 = _mm_castps_si128( px3 );
+
+        __m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
+
+        __m128i a0 = _mm_shuffle_epi8( c0, mask );
+        __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+        __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+        __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+
+        __m128i s0 = _mm_or_si128( a0, a1 );
+        __m128i s1 = _mm_or_si128( a2, a3 );
+        __m128i s2 = _mm_or_si128( s0, s1 );
+
+        _mm_store_si128( (__m128i*)r, s2 );
+
+        src += 4;
+#else
+        auto ptr8 = r;
+        for( int x=0; x<4; x++ )
+        {
+            auto v = *src;
+            *ptr8++ = (v & 0xff0000) >> 16;
+            src += width;
+            v = *src;
+            *ptr8++ = (v & 0xff0000) >> 16;
+            src += width;
+            v = *src;
+            *ptr8++ = (v & 0xff0000) >> 16;
+            src += width;
+            v = *src;
+            *ptr8++ = (v & 0xff0000) >> 16;
+            src -= width * 3 - 1;
+        }
+#endif
+        if( ++w == width/4 )
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessAlpha_ETC2( r );
+    }
+    while( --blocks );
+}
+
+void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    int w = 0;
+    uint8_t rg[4*4*2];
+    do
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
+
+        __m128i c0 = _mm_castps_si128( px0 );
+        __m128i c1 = _mm_castps_si128( px1 );
+        __m128i c2 = _mm_castps_si128( px2 );
+        __m128i c3 = _mm_castps_si128( px3 );
+
+        __m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
+
+        __m128i r0 = _mm_shuffle_epi8( c0, mask );
+        __m128i r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+        __m128i r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+        __m128i r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+
+        __m128i s0 = _mm_or_si128( r0, r1 );
+        __m128i s1 = _mm_or_si128( r2, r3 );
+        __m128i s2 = _mm_or_si128( s0, s1 );
+
+        _mm_store_si128( (__m128i*)rg, s2 );
+
+        mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );
+
+        r0 = _mm_shuffle_epi8( c0, mask );
+        r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+        r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+        r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+
+        s0 = _mm_or_si128( r0, r1 );
+        s1 = _mm_or_si128( r2, r3 );
+        s2 = _mm_or_si128( s0, s1 );
+
+        _mm_store_si128( (__m128i*)&rg[16], s2 );
+        src += 4;
+#else
+        auto ptrr = rg;
+        auto ptrg = ptrr + 16;
+        for( int x=0; x<4; x++ )
+        {
+            auto v = *src;
+            *ptrr++ = (v & 0xff0000) >> 16;
+            *ptrg++ = (v & 0xff00) >> 8;
+            src += width;
+            v = *src;
+            *ptrr++ = (v & 0xff0000) >> 16;
+            *ptrg++ = (v & 0xff00) >> 8;
+            src += width;
+            v = *src;
+            *ptrr++ = (v & 0xff0000) >> 16;
+            *ptrg++ = (v & 0xff00) >> 8;
+            src += width;
+            v = *src;
+            *ptrr++ = (v & 0xff0000) >> 16;
+            *ptrg++ = (v & 0xff00) >> 8;
+            src -= width * 3 - 1;
+        }
+#endif
+        if( ++w == width/4 )
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessAlpha_ETC2( rg );
+        *dst++ = ProcessAlpha_ETC2( &rg[16] );
+    }
+    while( --blocks );
+}
+// -- GODOT end --
diff --git a/thirdparty/etcpak/ProcessRGB.hpp b/thirdparty/etcpak/ProcessRGB.hpp
index 043b46e636a..050ea42562c 100644
--- a/thirdparty/etcpak/ProcessRGB.hpp
+++ b/thirdparty/etcpak/ProcessRGB.hpp
@@ -9,5 +9,8 @@ void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
 void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
 void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
 void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
-
+// -- GODOT start --
+void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+// -- GODOT end --
 #endif
diff --git a/thirdparty/etcpak/ProcessRgtc.cpp b/thirdparty/etcpak/ProcessRgtc.cpp
index 3a283b743bd..5eec2648db7 100644
--- a/thirdparty/etcpak/ProcessRgtc.cpp
+++ b/thirdparty/etcpak/ProcessRgtc.cpp
@@ -6,8 +6,49 @@
 #include
 #include
 
+#if defined __AVX__ && !defined __SSE4_1__
+#  define __SSE4_1__
+#endif
+
+#if defined __SSE4_1__ || defined __AVX2__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#    ifndef _mm256_cvtsi256_si32
+#      define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
+#    endif
+#  endif
+#endif
+
 static const uint8_t AlphaIndexTable[8] = { 1, 7, 6, 5, 4, 3, 2, 0 };
+static const uint8_t AlphaIndexTable_SSE[64] = {
+	9, 15, 14, 13, 12, 11, 10, 8, 57, 63, 62, 61, 60, 59, 58, 56,
+	49, 55, 54, 53, 52, 51, 50, 48, 41, 47, 46, 45, 44, 43, 42, 40,
+	33, 39, 38, 37, 36, 35, 34, 32, 25, 31, 30, 29, 28, 27, 26, 24,
+	17, 23, 22, 21, 20, 19, 18, 16, 1, 7, 6, 5, 4, 3, 2, 0,
+};
+
+static const uint16_t DivTableAlpha[256] = {
+	0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xe38e, 0xcccc, 0xba2e, 0xaaaa, 0x9d89, 0x9249, 0x8888, 0x8000,
+	0x7878, 0x71c7, 0x6bca, 0x6666, 0x6186, 0x5d17, 0x590b, 0x5555, 0x51eb, 0x4ec4, 0x4bda, 0x4924, 0x469e, 0x4444, 0x4210, 0x4000,
+	0x3e0f, 0x3c3c, 0x3a83, 0x38e3, 0x3759, 0x35e5, 0x3483, 0x3333, 0x31f3, 0x30c3, 0x2fa0, 0x2e8b, 0x2d82, 0x2c85, 0x2b93, 0x2aaa,
+	0x29cb, 0x28f5, 0x2828, 0x2762, 0x26a4, 0x25ed, 0x253c, 0x2492, 0x23ee, 0x234f, 0x22b6, 0x2222, 0x2192, 0x2108, 0x2082, 0x2000,
+	0x1f81, 0x1f07, 0x1e91, 0x1e1e, 0x1dae, 0x1d41, 0x1cd8, 0x1c71, 0x1c0e, 0x1bac, 0x1b4e, 0x1af2, 0x1a98, 0x1a41, 0x19ec, 0x1999,
+	0x1948, 0x18f9, 0x18ac, 0x1861, 0x1818, 0x17d0, 0x178a, 0x1745, 0x1702, 0x16c1, 0x1681, 0x1642, 0x1605, 0x15c9, 0x158e, 0x1555,
+	0x151d, 0x14e5, 0x14af, 0x147a, 0x1446, 0x1414, 0x13e2, 0x13b1, 0x1381, 0x1352, 0x1323, 0x12f6, 0x12c9, 0x129e, 0x1273, 0x1249,
+	0x121f, 0x11f7, 0x11cf, 0x11a7, 0x1181, 0x115b, 0x1135, 0x1111, 0x10ec, 0x10c9, 0x10a6, 0x1084, 0x1062, 0x1041, 0x1020, 0x1000,
+	0x0fe0, 0x0fc0, 0x0fa2, 0x0f83, 0x0f66, 0x0f48, 0x0f2b, 0x0f0f, 0x0ef2, 0x0ed7, 0x0ebb, 0x0ea0, 0x0e86, 0x0e6c, 0x0e52, 0x0e38,
+	0x0e1f, 0x0e07, 0x0dee, 0x0dd6, 0x0dbe, 0x0da7, 0x0d90, 0x0d79, 0x0d62, 0x0d4c, 0x0d36, 0x0d20, 0x0d0b, 0x0cf6, 0x0ce1, 0x0ccc,
+	0x0cb8, 0x0ca4, 0x0c90, 0x0c7c, 0x0c69, 0x0c56, 0x0c43, 0x0c30, 0x0c1e, 0x0c0c, 0x0bfa, 0x0be8, 0x0bd6, 0x0bc5, 0x0bb3, 0x0ba2,
+	0x0b92, 0x0b81, 0x0b70, 0x0b60, 0x0b50, 0x0b40, 0x0b30, 0x0b21, 0x0b11, 0x0b02, 0x0af3, 0x0ae4, 0x0ad6, 0x0ac7, 0x0ab8, 0x0aaa,
+	0x0a9c, 0x0a8e, 0x0a80, 0x0a72, 0x0a65, 0x0a57, 0x0a4a, 0x0a3d, 0x0a30, 0x0a23, 0x0a16, 0x0a0a, 0x09fd, 0x09f1, 0x09e4, 0x09d8,
+	0x09cc, 0x09c0, 0x09b4, 0x09a9, 0x099d, 0x0991, 0x0986, 0x097b, 0x0970, 0x0964, 0x095a, 0x094f, 0x0944, 0x0939, 0x092f, 0x0924,
+	0x091a, 0x090f, 0x0905, 0x08fb, 0x08f1, 0x08e7, 0x08dd, 0x08d3, 0x08ca, 0x08c0, 0x08b7, 0x08ad, 0x08a4, 0x089a, 0x0891, 0x0888,
+	0x087f, 0x0876, 0x086d, 0x0864, 0x085b, 0x0853, 0x084a, 0x0842, 0x0839, 0x0831, 0x0828, 0x0820, 0x0818, 0x0810, 0x0808, 0x0800,
+};
+
 
 static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
 {
 	uint8_t solid8 = *src;
@@ -40,20 +81,72 @@ static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
 	return max | ( min << 8 ) | ( data << 16 );
 }
 
+#ifdef __SSE4_1__
+static etcpak_force_inline uint64_t Process_Alpha_SSE( __m128i a )
+{
+	__m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
+	__m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
+	if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+	{
+		return _mm_cvtsi128_si32( a ) & 0xFF;
+	}
+
+	__m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+	__m128i max1 = _mm_max_epu8( a, a1 );
+	__m128i min1 = _mm_min_epu8( a, a1 );
+	__m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+	__m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+	__m128i max2 = _mm_max_epu8( max1, amax2 );
+	__m128i min2 = _mm_min_epu8( min1, amin2 );
+	__m128i amax3 = _mm_alignr_epi8( max2, max2, 2 );
+	__m128i amin3 = _mm_alignr_epi8( min2, min2, 2 );
+	__m128i max3 = _mm_max_epu8( max2, amax3 );
+	__m128i min3 = _mm_min_epu8( min2, amin3 );
+	__m128i amax4 = _mm_alignr_epi8( max3, max3, 1 );
+	__m128i amin4 = _mm_alignr_epi8( min3, min3, 1 );
+	__m128i max = _mm_max_epu8( max3, amax4 );
+	__m128i min = _mm_min_epu8( min3, amin4 );
+	__m128i minmax = _mm_unpacklo_epi8( max, min );
+
+	__m128i r = _mm_sub_epi8( max, min );
+	int range = _mm_cvtsi128_si32( r ) & 0xFF;
+	__m128i rv = _mm_set1_epi16( DivTableAlpha[range] );
+
+	__m128i v = _mm_sub_epi8( a, min );
+
+	__m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() );
+	__m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() );
+
+	__m128i lomul = _mm_mulhi_epu16( lo16, rv );
+	__m128i himul = _mm_mulhi_epu16( hi16, rv );
+
+	__m128i p0 = _mm_packus_epi16( lomul, himul );
+	__m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) );
+	__m128i p2 = _mm_packus_epi16( p1, p1 );
+
+	uint64_t pi = _mm_cvtsi128_si64( p2 );
+	uint64_t data = 0;
+	for( int i=0; i<8; i++ )
+	{
+		uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F];
+		data |= idx << (i*6);
+	}
+	return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
+}
+#endif
+
 void CompressRgtcR(const uint32_t *src, uint64_t *dst, uint32_t blocks, size_t width)
 {
 	int i = 0;
 	auto ptr = dst;
 	do
 	{
-		uint32_t rgba[4 * 4];
-		uint8_t r[4 * 4];
+#ifdef __SSE4_1__
+		__m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
+		__m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
+		__m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
+		__m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );
 
-		auto tmp = (char *)rgba;
-		memcpy(tmp, src + width * 0, 4 * 4);
-		memcpy(tmp + 4 * 4, src + width * 1, 4 * 4);
-		memcpy(tmp + 8 * 4, src + width * 2, 4 * 4);
-		memcpy(tmp + 12 * 4, src + width * 3, 4 * 4);
 		src += 4;
 		if (++i == width / 4)
 		{
@@ -61,11 +154,38 @@ void CompressRgtcR(const uint32_t *src, uint64_t *dst, uint32_t blocks, size_t w
 			i = 0;
 		}
 
-		for (int i = 0; i < 16; i++)
+		__m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );
+
+		__m128i m0 = _mm_shuffle_epi8( px0, mask );
+		__m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+		__m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+		__m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+		__m128i m4 = _mm_or_si128( m0, m1 );
+		__m128i m5 = _mm_or_si128( m2, m3 );
+
+		*ptr++ = Process_Alpha_SSE(_mm_or_si128( m4, m5 ));
+#else
+		uint8_t r[4 * 4];
+		auto rgba = src;
+		for (int i = 0; i < 4; i++)
 		{
-			r[i] = rgba[i] & 0x000000FF;
+			r[i * 4] = rgba[0] & 0xff;
+			r[i * 4 + 1] = rgba[1] & 0xff;
+			r[i * 4 + 2] = rgba[2] & 0xff;
+			r[i * 4 + 3] = rgba[3] & 0xff;
+
+			rgba += width;
 		}
+
+		src += 4;
+		if (++i == width / 4)
+		{
+			src += width * 3;
+			i = 0;
+		}
+
 		*ptr++ = ProcessAlpha(r);
+#endif
 	}
 	while (--blocks);
 }
@@ -76,15 +196,12 @@ void CompressRgtcRG(const uint32_t *src, uint64_t *dst, uint32_t blocks, size_t
 	auto ptr = dst;
 	do
 	{
-		uint32_t rgba[4 * 4];
-		uint8_t r[4 * 4];
-		uint8_t g[4 * 4];
-
-		auto tmp = (char *)rgba;
-		memcpy(tmp, src + width * 0, 4 * 4);
-		memcpy(tmp + 4 * 4, src + width * 1, 4 * 4);
-		memcpy(tmp + 8 * 4, src + width * 2, 4 * 4);
-		memcpy(tmp + 12 * 4, src + width * 3, 4 * 4);
+#ifdef __SSE4_1__
+		__m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
+		__m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
+		__m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
+		__m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );
+
 		src += 4;
 		if (++i == width / 4)
 		{
@@ -92,13 +209,55 @@ void CompressRgtcRG(const uint32_t *src, uint64_t *dst, uint32_t blocks, size_t
 			i = 0;
 		}
 
-		for (int i = 0; i < 16; i++)
+		__m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );
+
+		__m128i m0 = _mm_shuffle_epi8( px0, mask );
+		__m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+		__m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+		__m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+		__m128i m4 = _mm_or_si128( m0, m1 );
+		__m128i m5 = _mm_or_si128( m2, m3 );
+
+		*ptr++ = Process_Alpha_SSE(_mm_or_si128( m4, m5 ));
+
+		mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );
+
+		m0 = _mm_shuffle_epi8( px0, mask );
+		m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+		m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+		m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+		m4 = _mm_or_si128( m0, m1 );
+		m5 = _mm_or_si128( m2, m3 );
+
+		*ptr++ = Process_Alpha_SSE(_mm_or_si128( m4, m5 ));
+#else
+		uint8_t rg[4 * 4 * 2];
+		auto rgba = src;
+		for (int i = 0; i < 4; i++)
 		{
-			r[i] = rgba[i] & 0x000000FF;
-			g[i] = (rgba[i] & 0x0000FF00) >> 8;
+			rg[i * 4] = rgba[0] & 0xff;
+			rg[i * 4 + 1] = rgba[1] & 0xff;
+			rg[i * 4 + 2] = rgba[2] & 0xff;
+			rg[i * 4 + 3] = rgba[3] & 0xff;
+
+			rg[16 + i * 4] = (rgba[0] & 0xff00) >> 8;
+			rg[16 + i * 4 + 1] = (rgba[1] & 0xff00) >> 8;
+			rg[16 + i * 4 + 2] = (rgba[2] & 0xff00) >> 8;
+			rg[16 + i * 4 + 3] = (rgba[3] & 0xff00) >> 8;
+
+			rgba += width;
 		}
-		*ptr++ = ProcessAlpha(r);
-		*ptr++ = ProcessAlpha(g);
+
+		src += 4;
+		if (++i == width / 4)
+		{
+			src += width * 3;
+			i = 0;
+		}
+
+		*ptr++ = ProcessAlpha(rg);
+		*ptr++ = ProcessAlpha(&rg[16]);
+#endif
 	}
 	while (--blocks);
 }
diff --git a/thirdparty/etcpak/patches/etc2-r-rg.patch b/thirdparty/etcpak/patches/etc2-r-rg.patch
new file mode 100644
index 00000000000..5d6c117bf71
--- /dev/null
+++ b/thirdparty/etcpak/patches/etc2-r-rg.patch
@@ -0,0 +1,164 @@
+diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
+index 4dc3bf23af..0caa687bc6 100644
+--- a/thirdparty/etcpak/ProcessRGB.cpp
++++ b/thirdparty/etcpak/ProcessRGB.cpp
+@@ -4181,3 +4181,145 @@ void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size
+     }
+     while( --blocks );
+ }
++
++// -- GODOT start --
++void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
++{
++    int w = 0;
++    uint8_t r[4*4];
++    do
++    {
++#ifdef __SSE4_1__
++        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
++        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
++        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
++        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
++
++        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
++
++        __m128i c0 = _mm_castps_si128( px0 );
++        __m128i c1 = _mm_castps_si128( px1 );
++        __m128i c2 = _mm_castps_si128( px2 );
++        __m128i c3 = _mm_castps_si128( px3 );
++
++        __m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
++
++        __m128i a0 = _mm_shuffle_epi8( c0, mask );
++        __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
++        __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
++        __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
++
++        __m128i s0 = _mm_or_si128( a0, a1 );
++        __m128i s1 = _mm_or_si128( a2, a3 );
++        __m128i s2 = _mm_or_si128( s0, s1 );
++
++        _mm_store_si128( (__m128i*)r, s2 );
++
++        src += 4;
++#else
++        auto ptr8 = r;
++        for( int x=0; x<4; x++ )
++        {
++            auto v = *src;
++            *ptr8++ = (v & 0xff0000) >> 16;
++            src += width;
++            v = *src;
++            *ptr8++ = (v & 0xff0000) >> 16;
++            src += width;
++            v = *src;
++            *ptr8++ = (v & 0xff0000) >> 16;
++            src += width;
++            v = *src;
++            *ptr8++ = (v & 0xff0000) >> 16;
++            src -= width * 3 - 1;
++        }
++#endif
++        if( ++w == width/4 )
++        {
++            src += width * 3;
++            w = 0;
++        }
++        *dst++ = ProcessAlpha_ETC2( r );
++    }
++    while( --blocks );
++}
++
++void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
++{
++    int w = 0;
++    uint8_t rg[4*4*2];
++    do
++    {
++#ifdef __SSE4_1__
++        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
++        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
++        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
++        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
++
++        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
++
++        __m128i c0 = _mm_castps_si128( px0 );
++        __m128i c1 = _mm_castps_si128( px1 );
++        __m128i c2 = _mm_castps_si128( px2 );
++        __m128i c3 = _mm_castps_si128( px3 );
++
++        __m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
++
++        __m128i r0 = _mm_shuffle_epi8( c0, mask );
++        __m128i r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
++        __m128i r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
++        __m128i r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
++
++        __m128i s0 = _mm_or_si128( r0, r1 );
++        __m128i s1 = _mm_or_si128( r2, r3 );
++        __m128i s2 = _mm_or_si128( s0, s1 );
++
++        _mm_store_si128( (__m128i*)rg, s2 );
++
++        mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );
++
++        r0 = _mm_shuffle_epi8( c0, mask );
++        r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
++        r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
++        r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
++
++        s0 = _mm_or_si128( r0, r1 );
++        s1 = _mm_or_si128( r2, r3 );
++        s2 = _mm_or_si128( s0, s1 );
++
++        _mm_store_si128( (__m128i*)&rg[16], s2 );
++        src += 4;
++#else
++        auto ptrr = rg;
++        auto ptrg = ptrr + 16;
++        for( int x=0; x<4; x++ )
++        {
++            auto v = *src;
++            *ptrr++ = (v & 0xff0000) >> 16;
++            *ptrg++ = (v & 0xff00) >> 8;
++            src += width;
++            v = *src;
++            *ptrr++ = (v & 0xff0000) >> 16;
++            *ptrg++ = (v & 0xff00) >> 8;
++            src += width;
++            v = *src;
++            *ptrr++ = (v & 0xff0000) >> 16;
++            *ptrg++ = (v & 0xff00) >> 8;
++            src += width;
++            v = *src;
++            *ptrr++ = (v & 0xff0000) >> 16;
++            *ptrg++ = (v & 0xff00) >> 8;
++            src -= width * 3 - 1;
++        }
++#endif
++        if( ++w == width/4 )
++        {
++            src += width * 3;
++            w = 0;
++        }
++        *dst++ = ProcessAlpha_ETC2( rg );
++        *dst++ = ProcessAlpha_ETC2( &rg[16] );
++    }
++    while( --blocks );
++}
++// -- GODOT end --
+diff --git a/thirdparty/etcpak/ProcessRGB.hpp b/thirdparty/etcpak/ProcessRGB.hpp
+index 043b46e636..050ea42562 100644
+--- a/thirdparty/etcpak/ProcessRGB.hpp
++++ b/thirdparty/etcpak/ProcessRGB.hpp
+@@ -9,5 +9,8 @@ void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
+ void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+ void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
+ void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
+-
++// -- GODOT start --
++void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
++void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
++// -- GODOT end --
+ #endif