#[versions]

standard = "";
dithered = "#define BC1_DITHER";

#[compute]

#version 450

#include "CrossPlatformSettings_piece_all.glsl"
#include "UavCrossPlatform_piece_all.glsl"

#define FLT_MAX 340282346638528859811704183484516925440.0f

// Source image; main() fetches a 4x4 group of texels from it per invocation.
layout(binding = 0) uniform sampler2D srcTex;
// Destination image; main() stores one uint2 (endpoints, index mask) per invocation.
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;

// Lookup tables indexed by an 8-bit channel value; element [0] feeds the max endpoint
// and element [1] the min endpoint (see their use in RefineBlock() and main()).
layout(std430, binding = 2) readonly restrict buffer globalBuffer {
    float2 c_oMatch5[256];
    float2 c_oMatch6[256];
};

layout(push_constant, std430) uniform Params {
    // Upper bound on the number of refinement passes run by main().
    uint p_numRefinements;
    uint p_padding[3];
}
params;

layout(local_size_x = 8, //
        local_size_y = 8, //
        local_size_z = 1) in;

/// Expands a packed RGB565 value (held in a float) into three channels in the 0..255 range.
///
/// The exact 565 -> 888 conversion would be:
///     rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
///
/// stb_dxt uses a different one, which is what is implemented here:
///     rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
///     g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
///
/// It is not clear why; possibly this is how S3TC specifies the decode, possibly for the
/// reason discussed in
///     http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
/// or possibly just because it is cheap to do with integer shifts.
/// We follow stb_dxt's conversion just in case (the result is almost the same, differing
/// by 1 or -1 for very few values). It may not matter for 888 -> 565 -> 888 round trips,
/// since those end up mapping back to the original number.
float3 rgb565to888(float rgb565)
{
    // Split the packed value into its 5-, 6- and 5-bit fields
    const float red5 = floor(rgb565 / 2048.0f);
    const float green6 = floor(mod(rgb565, 2048.0f) / 32.0f);
    const float blue5 = floor(mod(rgb565, 32.0f));

    return floor(float3(red5, green6, blue5) * float3(8.25f, 4.0625f, 8.25f));
}

/// Packs a colour whose channels are in the 0..255 range into a single RGB565 value
/// (returned as a float), rounding each channel to the nearest 5/6/5-bit level.
float rgb888to565(float3 rgbValue)
{
    const float2 redBlue5 = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f);
    const float green6 = floor(rgbValue.g * 63.0f / 255.0f + 0.5f);

    return redBlue5.x * 2048.0f + green6 * 32.0f + redBlue5.y;
}

// linear interpolation at 1/3 point between a and b, using desired rounding type
float3 lerp13(float3 a, float3 b)
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
    // with rounding bias
    const float3 delta = b - a;
    return a + floor(delta * (1.0f / 3.0f) + 0.5f);
#else
    // without rounding bias
    const float3 weighted = 2.0f * a + b;
    return floor(weighted / 3.0f);
#endif
}

/// Unpacks a block of 4 colors from two 16-bit endpoints:
/// entries 0 and 1 are the decoded endpoints, entries 2 and 3 the two 1/3 blends.
void EvalColors(out float3 colors[4], float c0, float c1)
{
    const float3 endpoint0 = rgb565to888(c0);
    const float3 endpoint1 = rgb565to888(c1);

    colors[0] = endpoint0;
    colors[1] = endpoint1;
    colors[2] = lerp13(endpoint0, endpoint1);
    colors[3] = lerp13(endpoint1, endpoint0);
}

/** The color optimization function.
/** (Clever code, part 1)
    Chooses the two endpoint colours of a block. The covariance of the 16 colours around
    their average is accumulated, its principal axis is estimated with a few power
    iterations, and the pixels with the smallest and the largest projection onto that
    axis are returned as endpoints.
@param srcPixelsBlock
    The 16 pixels of the block, each one readable with unpackUnorm4x8 (only .xyz is used)
@param outMinEndp16 [out]
    Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
    Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16)
{
    // determine color distribution
    float3 avgColor;
    float3 minColor;
    float3 maxColor;

    avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
    for (int i = 1; i < 16; ++i)
    {
        const float3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz;
        avgColor += currColorUnorm;
        minColor = min(minColor, currColorUnorm);
        maxColor = max(maxColor, currColorUnorm);
    }

    // Everything below works in the 0..255 range
    avgColor = round(avgColor * 255.0f / 16.0f);
    maxColor *= 255.0f;
    minColor *= 255.0f;

    // determine covariance matrix (symmetric, so only 6 unique terms are kept:
    // rr, rg, rb, gg, gb, bb)
    float cov[6];
    for (int i = 0; i < 6; ++i)
        cov[i] = 0;

    for (int i = 0; i < 16; ++i)
    {
        const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
        float3 rgbDiff = currColor - avgColor;

        cov[0] += rgbDiff.r * rgbDiff.r;
        cov[1] += rgbDiff.r * rgbDiff.g;
        cov[2] += rgbDiff.r * rgbDiff.b;
        cov[3] += rgbDiff.g * rgbDiff.g;
        cov[4] += rgbDiff.g * rgbDiff.b;
        cov[5] += rgbDiff.b * rgbDiff.b;
    }

    // convert covariance matrix to float, find principal axis via power iter
    for (int i = 0; i < 6; ++i)
        cov[i] /= 255.0f;

    // Start the iteration from the bounding-box diagonal
    float3 vF = maxColor - minColor;

    const int nIterPower = 4;
    for (int iter = 0; iter < nIterPower; ++iter)
    {
        const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];
        const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
        const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];

        vF.r = r;
        vF.g = g;
        vF.b = b;
    }

    float magn = max3(abs(vF.r), abs(vF.g), abs(vF.b));
    float3 v;

    if (magn < 4.0f)
    {
        // too small, default to luminance
        v.r = 299.0f;  // JPEG YCbCr luma coefs, scaled by 1000.
        v.g = 587.0f;
        v.b = 114.0f;
    }
    else
    {
        // Rescale so that the largest component has magnitude 512
        v = trunc(vF * (512.0f / magn));
    }

    // Pick colors at extreme points
    float3 minEndpoint, maxEndpoint;
    float minDot = FLT_MAX;
    float maxDot = -FLT_MAX;
    for (int i = 0; i < 16; ++i)
    {
        const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
        const float dotValue = dot(currColor, v);

        if (dotValue < minDot)
        {
            minDot = dotValue;
            minEndpoint = currColor;
        }

        if (dotValue > maxDot)
        {
            maxDot = dotValue;
            maxEndpoint = currColor;
        }
    }

    outMinEndp16 = rgb888to565(minEndpoint);
    outMaxEndp16 = rgb888to565(maxEndpoint);
}

/// Helper for MatchColorsBlock: turns a projected value into a 2-bit palette index
/// by comparing it against the three crossover points.
uint SelectBlockIndex(float projected, float c0Point, float halfPoint, float c3Point)
{
    if (projected < halfPoint)
        return (projected < c0Point) ? 1u : 3u;
    return (projected < c3Point) ? 2u : 0u;
}

/** The color matching function.
    Assigns to every pixel of the block one of the 4 palette colours.
@param srcPixelsBlock
    The 16 pixels of the block, each one readable with unpackUnorm4x8 (only .xyz is used)
@param color
    The 4 palette colours, in the layout written by EvalColors
@return
    The 16 indices, 2 bits per pixel, with pixel 0 in the lowest two bits
*/
uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4])
{
    uint mask = 0u;
    float3 dir = color[0] - color[1];
    float stops[4];

    for (int i = 0; i < 4; ++i)
        stops[i] = dot(color[i], dir);

    // think of the colors as arranged on a line; project point onto that line, then choose
    // next color out of available ones. we compute the crossover points for "best color in top
    // half"/"best in bottom half" and then the same inside that subinterval.
    //
    // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
    // but it's very close and a lot faster.
    // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

    float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
    float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
    float c3Point = trunc((stops[2] + stops[0]) * 0.5f);

#ifndef BC1_DITHER
    // the version without dithering is straightforward
    // (walk the pixels backwards so that pixel 0 ends up in the lowest bits)
    for (uint i = 16u; i-- > 0u;)
    {
        const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
        const float dotValue = dot(currColor, dir);

        mask <<= 2u;
        mask |= SelectBlockIndex(dotValue, c0Point, halfPoint, c3Point);
    }
#else
    // with floyd-steinberg dithering
    // ep1 collects the errors of the row being processed, ep2 holds those of the previous row
    float4 ep1 = float4(0, 0, 0, 0);
    float4 ep2 = float4(0, 0, 0, 0);

    // The error weights sum to 16, so compare in a x16 scale instead of dividing
    c0Point *= 16.0f;
    halfPoint *= 16.0f;
    c3Point *= 16.0f;

    for (uint y = 0u; y < 4u; ++y)
    {
        const uint rowBase = y * 4u;
        float ditherDot;
        uint lmask, idx;

        float3 currColor;
        float dotValue;

        // column 0
        currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 0u]).xyz * 255.0f;
        dotValue = dot(currColor, dir);
        ditherDot = (dotValue * 16.0f) + (3.0f * ep2[1] + 5.0f * ep2[0]);
        idx = SelectBlockIndex(ditherDot, c0Point, halfPoint, c3Point);
        ep1[0] = dotValue - stops[idx];
        lmask = idx;

        // column 1
        currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 1u]).xyz * 255.0f;
        dotValue = dot(currColor, dir);
        ditherDot = (dotValue * 16.0f) + (7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0]);
        idx = SelectBlockIndex(ditherDot, c0Point, halfPoint, c3Point);
        ep1[1] = dotValue - stops[idx];
        lmask |= idx << 2u;

        // column 2
        currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 2u]).xyz * 255.0f;
        dotValue = dot(currColor, dir);
        ditherDot = (dotValue * 16.0f) + (7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1]);
        idx = SelectBlockIndex(ditherDot, c0Point, halfPoint, c3Point);
        ep1[2] = dotValue - stops[idx];
        lmask |= idx << 4u;

        // column 3 (this used to read column 2 a second time, so the last pixel of every
        // row was matched against the wrong source colour)
        currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 3u]).xyz * 255.0f;
        dotValue = dot(currColor, dir);
        ditherDot = (dotValue * 16.0f) + (7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2]);
        idx = SelectBlockIndex(ditherDot, c0Point, halfPoint, c3Point);
        ep1[3] = dotValue - stops[idx];
        lmask |= idx << 6u;

        // Each row contributes 8 bits to the mask
        mask |= lmask << (y * 8u);
        {
            // swap: the current row's errors become the "previous row" of the next one
            float4 tmp = ep1;
            ep1 = ep2;
            ep2 = tmp;
        }
    }
#endif

    return mask;
}

// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
//
// srcPixelsBlock:  the 16 pixels of the block, readable with unpackUnorm4x8 (only .xyz used)
// mask:            current 2-bit-per-pixel indices, pixel 0 in the lowest bits
// inOutMinEndp16 / inOutMaxEndp16: endpoints in RGB565; replaced by the refined ones
// Returns true when at least one of the two endpoints changed.
bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
        inout float inOutMaxEndp16)
{
    float newMin16, newMax16;
    const float oldMin = inOutMinEndp16;
    const float oldMax = inOutMaxEndp16;

    if ((mask ^ (mask << 2u)) < 4u)  // all pixels have the same index?
    {
        // yes, linear system would be singular; solve using optimal
        // single-color match on average color
        // (the 8/255 start value becomes a +8 rounding bias before the divide by 16)
        float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
        for (int i = 0; i < 16; ++i)
            rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;

        rgbVal = floor(rgbVal * (255.0f / 16.0f));

        newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f +  //
                c_oMatch6[uint(rgbVal.g)][0] * 32.0f +       //
                c_oMatch5[uint(rgbVal.b)][0];
        newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f +  //
                c_oMatch6[uint(rgbVal.g)][1] * 32.0f +       //
                c_oMatch5[uint(rgbVal.b)][1];
    }
    else
    {
        // Weight of the first endpoint for each index; the second one gets w2 = 3 - w1
        const float w1Tab[4] = { 3, 0, 2, 1 };
        // Per index: w1*w1 * 65536 + w2*w2 * 256 + w1*w2
        const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
        // ^some magic to save a lot of multiplies in the accumulating loop...
        // (precomputed products of weights for least squares system, accumulated inside one 32-bit
        // register)

        float akku = 0.0f;
        uint cm = mask;
        float3 at1 = float3(0, 0, 0);
        float3 at2 = float3(0, 0, 0);
        for (int i = 0; i < 16; ++i, cm >>= 2u)
        {
            const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;

            const uint step = cm & 3u;
            const float w1 = w1Tab[step];
            akku += prods[step];
            at1 += currColor * w1;
            at2 += currColor;
        }

        // at2 so far is sum(color); turn it into sum(color * w2) with w2 = 3 - w1
        at2 = 3.0f * at2 - at1;

        // extract solutions and decide solvability
        // The three sums sit at bits 16, 8 and 0 of akku, so the top field must be split
        // off with 65536 (2^16), not 65535.
        const float xx = floor(akku / 65536.0f);
        const float yy = floor(mod(akku, 65536.0f) / 256.0f);
        const float xy = mod(akku, 256.0f);

        // 1 / determinant, pre-multiplied by the 0..255 -> 5 bit (x) and 6 bit (y) scales
        float2 f_rb_g;
        f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
        f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

        // solve.
        const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
                float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
        newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

        const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
                float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
        newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
    }

    inOutMinEndp16 = newMin16;
    inOutMaxEndp16 = newMax16;

    return oldMin != newMin16 || oldMax != newMax16;
}

#ifdef BC1_DITHER
/// Quantizes 'srcValue' which is originally in 888 (full range),
/// converting it to 565 and then back to 888 (quantized)
float3 quant(float3 srcValue)
{
    srcValue = clamp(srcValue, 0.0f, 255.0f);
    // Convert 888 -> 565
    srcValue = floor(srcValue * float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f) + 0.5f);
    // Convert 565 -> 888 back
    srcValue = floor(srcValue * float3(8.25f, 4.0625f, 8.25f));

    return srcValue;
}

/// Writes to dthPixBlck a copy of srcPixBlck in which every pixel has been quantized
/// with quant(), diffusing the quantization error to the neighbouring pixels of the block.
/// Alpha of the output pixels is set to 1.
void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16])
{
    // ep1 collects the errors of the row being processed, ep2 holds those of the previous row
    float3 ep1[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
    float3 ep2[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };

    // y is the index of the first pixel of each row
    for (uint y = 0u; y < 16u; y += 4u)
    {
        float3 srcPixel, dithPixel;

        srcPixel = unpackUnorm4x8(srcPixBlck[y + 0u]).xyz * 255.0f;
        dithPixel = quant(srcPixel + trunc((3.0f * ep2[1] + 5.0f * ep2[0]) * (1.0f / 16.0f)));
        ep1[0] = srcPixel - dithPixel;
        dthPixBlck[y + 0u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

        srcPixel = unpackUnorm4x8(srcPixBlck[y + 1u]).xyz * 255.0f;
        dithPixel = quant(srcPixel +
                trunc((7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0]) * (1.0f / 16.0f)));
        ep1[1] = srcPixel - dithPixel;
        dthPixBlck[y + 1u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

        srcPixel = unpackUnorm4x8(srcPixBlck[y + 2u]).xyz * 255.0f;
        dithPixel = quant(srcPixel +
                trunc((7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1]) * (1.0f / 16.0f)));
        ep1[2] = srcPixel - dithPixel;
        dthPixBlck[y + 2u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

        srcPixel = unpackUnorm4x8(srcPixBlck[y + 3u]).xyz * 255.0f;
        dithPixel = quant(srcPixel + trunc((7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2]) * (1.0f / 16.0f)));
        ep1[3] = srcPixel - dithPixel;
        dthPixBlck[y + 3u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

        // swap( ep1, ep2 )
        for (uint i = 0u; i < 4u; ++i)
        {
            float3 tmp = ep1[i];
            ep1[i] = ep2[i];
            ep2[i] = tmp;
        }
    }
}
#endif

/// Entry point. Each invocation reads the 4x4 texels starting at gl_GlobalInvocationID.xy * 4
/// from srcTex, encodes them as two RGB565 endpoints plus 16 2-bit indices, and stores the
/// result at gl_GlobalInvocationID.xy in dstTexture.
void main()
{
    uint srcPixelsBlock[16];

    bool bAllColorsEqual = true;

    // Load the whole 4x4 block
    const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
    for (uint i = 0u; i < 16u; ++i)
    {
        const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);
        const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;
        srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));
        bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
    }

    float maxEndp16, minEndp16;
    uint mask = 0u;

    if (bAllColorsEqual)
    {
        // Single-colour block: take both endpoints from the lookup tables.
        // Round to nearest before converting: unpack * 255 is not guaranteed to land exactly
        // on the integer, and truncating a value just below it would select the wrong entry.
        const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f + 0.5f);
        mask = 0xAAAAAAAAu;  // every pixel uses index 2
        maxEndp16 =
                c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
        minEndp16 =
                c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
    }
    else
    {
#ifdef BC1_DITHER
        uint ditherPixelsBlock[16];
        // first step: compute dithered version for PCA if desired
        DitherBlock(srcPixelsBlock, ditherPixelsBlock);
#else
#define ditherPixelsBlock srcPixelsBlock
#endif

        // second step: pca+map along principal axis
        OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
        if (minEndp16 != maxEndp16)
        {
            float3 colors[4];
            EvalColors(colors, maxEndp16, minEndp16);  // Note min/max are inverted
            mask = MatchColorsBlock(srcPixelsBlock, colors);
        }

        // third step: refine (multiple times if requested)
        bool bStopRefinement = false;
        for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i)
        {
            const uint lastMask = mask;

            if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16))
            {
                if (minEndp16 != maxEndp16)
                {
                    float3 colors[4];
                    EvalColors(colors, maxEndp16, minEndp16);  // Note min/max are inverted
                    mask = MatchColorsBlock(srcPixelsBlock, colors);
                }
                else
                {
                    // Endpoints collapsed into one colour; nothing left to refine
                    mask = 0u;
                    bStopRefinement = true;
                }
            }

            // Stop as soon as a pass leaves the indices untouched
            bStopRefinement = mask == lastMask || bStopRefinement;
        }
    }

    // write the color block
    if (maxEndp16 < minEndp16)
    {
        // Keep the larger endpoint first; swapping the endpoints swaps index 0 <-> 1 and
        // 2 <-> 3, which is what flipping the low bit of every index does.
        const float tmpValue = minEndp16;
        minEndp16 = maxEndp16;
        maxEndp16 = tmpValue;
        mask ^= 0x55555555u;
    }

    uint2 outputBytes;
    outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u);
    outputBytes.y = mask;

    uint2 dstUV = gl_GlobalInvocationID.xy;
    imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
}