From 606eedb0c9093789018f64381e79e7d93dd8752d Mon Sep 17 00:00:00 2001 From: BlueCube3310 <53150244+BlueCube3310@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:14:48 +0200 Subject: [PATCH] Betsy: Add caching and BC1 compression support --- core/io/image.cpp | 14 + core/io/image.h | 1 + doc/classes/ProjectSettings.xml | 7 +- modules/betsy/SCsub | 1 + modules/betsy/bc1.glsl | 483 +++++++++++ modules/betsy/betsy_bc1.h | 1061 ++++++++++++++++++++++++ modules/betsy/image_compress_betsy.cpp | 408 ++++++--- modules/betsy/image_compress_betsy.h | 31 +- modules/betsy/register_types.cpp | 3 + servers/rendering_server.cpp | 1 + thirdparty/README.md | 2 +- 11 files changed, 1896 insertions(+), 116 deletions(-) create mode 100644 modules/betsy/bc1.glsl create mode 100644 modules/betsy/betsy_bc1.h diff --git a/core/io/image.cpp b/core/io/image.cpp index fcbe483e381..bc018bd45c6 100644 --- a/core/io/image.cpp +++ b/core/io/image.cpp @@ -2751,6 +2751,19 @@ Error Image::compress_from_channels(CompressMode p_mode, UsedChannels p_channels } break; + case COMPRESS_S3TC: { + // BC3 is unsupported currently. + if ((p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) { + Error result = _image_compress_bc_rd_func(this, p_channels); + + // If the image was compressed successfully, we return here. If not, we fall back to the default compression scheme. + if (result == OK) { + return OK; + } + } + + } break; + default: { } } @@ -3138,6 +3151,7 @@ void (*Image::_image_compress_etc1_func)(Image *) = nullptr; void (*Image::_image_compress_etc2_func)(Image *, Image::UsedChannels) = nullptr; void (*Image::_image_compress_astc_func)(Image *, Image::ASTCFormat) = nullptr; Error (*Image::_image_compress_bptc_rd_func)(Image *, Image::UsedChannels) = nullptr; +Error (*Image::_image_compress_bc_rd_func)(Image *, Image::UsedChannels) = nullptr; void (*Image::_image_decompress_bc)(Image *) = nullptr; void (*Image::_image_decompress_bptc)(Image *) = nullptr; void (*Image::_image_decompress_etc1)(Image *) = nullptr; diff --git a/core/io/image.h b/core/io/image.h index 4461ae71a6e..78757246e06 100644 --- a/core/io/image.h +++ b/core/io/image.h @@ -160,6 +160,7 @@ public: static void (*_image_compress_astc_func)(Image *, ASTCFormat p_format); static Error (*_image_compress_bptc_rd_func)(Image *, UsedChannels p_channels); + static Error (*_image_compress_bc_rd_func)(Image *, UsedChannels p_channels); static void (*_image_decompress_bc)(Image *); static void (*_image_decompress_bptc)(Image *); diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index e5b787714f5..7b834c166b6 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -2890,10 +2890,13 @@ If [code]true[/code], the texture importer will import lossless textures using the PNG format. Otherwise, it will default to using WebP. + + If [code]true[/code], the GPU texture compressor will cache the local RenderingDevice and its resources (shaders and pipelines), allowing for faster subsequent imports at a memory cost. + - If [code]true[/code], the texture importer will utilize the GPU for compressing textures, which makes large textures import significantly faster. + If [code]true[/code], the texture importer will utilize the GPU for compressing textures, improving the import time of large images. [b]Note:[/b] This setting requires either Vulkan or D3D12 available as a rendering backend. - [b]Note:[/b] Currently this only affects BC6H compression, which is used on Desktop and Console for HDR images. + [b]Note:[/b] Currently this only affects BC1 and BC6H compression, which are used on Desktop and Console for fully opaque and HDR images respectively. If [code]true[/code], the texture importer will import VRAM-compressed textures using the Ericsson Texture Compression 2 algorithm for lower quality textures and normal maps and Adaptable Scalable Texture Compression algorithm for high quality textures (in 4×4 block size). diff --git a/modules/betsy/SCsub b/modules/betsy/SCsub index 9930e1f4cf9..ed5dcbf58b7 100644 --- a/modules/betsy/SCsub +++ b/modules/betsy/SCsub @@ -4,6 +4,7 @@ Import("env_modules") env_betsy = env_modules.Clone() env_betsy.GLSL_HEADER("bc6h.glsl") +env_betsy.GLSL_HEADER("bc1.glsl") env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"]) # Thirdparty source files diff --git a/modules/betsy/bc1.glsl b/modules/betsy/bc1.glsl new file mode 100644 index 00000000000..f1b2c282547 --- /dev/null +++ b/modules/betsy/bc1.glsl @@ -0,0 +1,483 @@ +#[versions] + +standard = ""; +dithered = "#define BC1_DITHER"; + +#[compute] +#version 450 + +#include "CrossPlatformSettings_piece_all.glsl" +#include "UavCrossPlatform_piece_all.glsl" + +#define FLT_MAX 340282346638528859811704183484516925440.0f + +layout(binding = 0) uniform sampler2D srcTex; +layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture; + +layout(std430, binding = 2) readonly restrict buffer globalBuffer { + float2 c_oMatch5[256]; + float2 c_oMatch6[256]; +}; + +layout(push_constant, std430) uniform Params { + uint p_numRefinements; + uint p_padding[3]; +} +params; + +layout(local_size_x = 8, // + local_size_y = 8, // + local_size_z = 1) in; + +float3 rgb565to888(float rgb565) { + float3 retVal; + retVal.x = floor(rgb565 / 2048.0f); + retVal.y = floor(mod(rgb565, 2048.0f) / 32.0f); + retVal.z = floor(mod(rgb565, 32.0f)); + + // This is the correct 565 to 888 conversion: + // rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f ) + // + // However stb_dxt follows a different one: + // rb = floor( rb * ( 256 / 32 + 8 / 32 ) ); + // g = floor( g * ( 256 / 64 + 4 / 64 ) ); + // + // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded + // It's quite possible this is the reason: + // http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/ + // + // Or maybe it's just because it's cheap to do with integer shifts. + // Anyway, we follow stb_dxt's conversion just in case + // (gives almost the same result, with 1 or -1 of difference for a very few values) + // + // Perhaps when we make 888 -> 565 -> 888 it doesn't matter + // because they end up mapping to the original number + + return floor(retVal * float3(8.25f, 4.0625f, 8.25f)); +} + +float rgb888to565(float3 rgbValue) { + rgbValue.rb = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f); + rgbValue.g = floor(rgbValue.g * 63.0f / 255.0f + 0.5f); + + return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b; +} + +// linear interpolation at 1/3 point between a and b, using desired rounding type +float3 lerp13(float3 a, float3 b) { +#ifdef STB_DXT_USE_ROUNDING_BIAS + // with rounding bias + return a + floor((b - a) * (1.0f / 3.0f) + 0.5f); +#else + // without rounding bias + return floor((2.0f * a + b) / 3.0f); +#endif +} + +/// Unpacks a block of 4 colors from two 16-bit endpoints +void EvalColors(out float3 colors[4], float c0, float c1) { + colors[0] = rgb565to888(c0); + colors[1] = rgb565to888(c1); + colors[2] = lerp13(colors[0], colors[1]); + colors[3] = lerp13(colors[1], colors[0]); +} + +/** The color optimization function. (Clever code, part 1) +@param outMinEndp16 [out] + Minimum endpoint, in RGB565 +@param outMaxEndp16 [out] + Maximum endpoint, in RGB565 +*/ +void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) { + // determine color distribution + float3 avgColor; + float3 minColor; + float3 maxColor; + + avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz; + for (int i = 1; i < 16; ++i) { + const float3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz; + avgColor += currColorUnorm; + minColor = min(minColor, currColorUnorm); + maxColor = max(maxColor, currColorUnorm); + } + + avgColor = round(avgColor * 255.0f / 16.0f); + maxColor *= 255.0f; + minColor *= 255.0f; + + // determine covariance matrix + float cov[6]; + for (int i = 0; i < 6; ++i) + cov[i] = 0; + + for (int i = 0; i < 16; ++i) { + const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f; + float3 rgbDiff = currColor - avgColor; + + cov[0] += rgbDiff.r * rgbDiff.r; + cov[1] += rgbDiff.r * rgbDiff.g; + cov[2] += rgbDiff.r * rgbDiff.b; + cov[3] += rgbDiff.g * rgbDiff.g; + cov[4] += rgbDiff.g * rgbDiff.b; + cov[5] += rgbDiff.b * rgbDiff.b; + } + + // convert covariance matrix to float, find principal axis via power iter + for (int i = 0; i < 6; ++i) + cov[i] /= 255.0f; + + float3 vF = maxColor - minColor; + + const int nIterPower = 4; + for (int iter = 0; iter < nIterPower; ++iter) { + const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2]; + const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4]; + const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5]; + + vF.r = r; + vF.g = g; + vF.b = b; + } + + float magn = max3(abs(vF.r), abs(vF.g), abs(vF.b)); + float3 v; + + if (magn < 4.0f) { // too small, default to luminance + v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000. + v.g = 587.0f; + v.b = 114.0f; + } else { + v = trunc(vF * (512.0f / magn)); + } + + // Pick colors at extreme points + float3 minEndpoint, maxEndpoint; + float minDot = FLT_MAX; + float maxDot = -FLT_MAX; + for (int i = 0; i < 16; ++i) { + const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f; + const float dotValue = dot(currColor, v); + + if (dotValue < minDot) { + minDot = dotValue; + minEndpoint = currColor; + } + + if (dotValue > maxDot) { + maxDot = dotValue; + maxEndpoint = currColor; + } + } + + outMinEndp16 = rgb888to565(minEndpoint); + outMaxEndp16 = rgb888to565(maxEndpoint); +} + +// The color matching function +uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) { + uint mask = 0u; + float3 dir = color[0] - color[1]; + float stops[4]; + + for (int i = 0; i < 4; ++i) + stops[i] = dot(color[i], dir); + + // think of the colors as arranged on a line; project point onto that line, then choose + // next color out of available ones. we compute the crossover points for "best color in top + // half"/"best in bottom half" and then the same inside that subinterval. + // + // relying on this 1d approximation isn't always optimal in terms of euclidean distance, + // but it's very close and a lot faster. + // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html + + float c0Point = trunc((stops[1] + stops[3]) * 0.5f); + float halfPoint = trunc((stops[3] + stops[2]) * 0.5f); + float c3Point = trunc((stops[2] + stops[0]) * 0.5f); + +#ifndef BC1_DITHER + // the version without dithering is straightforward + for (uint i = 16u; i-- > 0u;) { + const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f; + + const float dotValue = dot(currColor, dir); + mask <<= 2u; + + if (dotValue < halfPoint) + mask |= ((dotValue < c0Point) ? 1u : 3u); + else + mask |= ((dotValue < c3Point) ? 2u : 0u); + } +#else + // with floyd-steinberg dithering + float4 ep1 = float4(0, 0, 0, 0); + float4 ep2 = float4(0, 0, 0, 0); + + c0Point *= 16.0f; + halfPoint *= 16.0f; + c3Point *= 16.0f; + + for (uint y = 0u; y < 4u; ++y) { + float ditherDot; + uint lmask, step; + + float3 currColor; + float dotValue; + + currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 0]).xyz * 255.0f; + dotValue = dot(currColor, dir); + + ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]); + if (ditherDot < halfPoint) + step = (ditherDot < c0Point) ? 1u : 3u; + else + step = (ditherDot < c3Point) ? 2u : 0u; + ep1[0] = dotValue - stops[step]; + lmask = step; + + currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 1]).xyz * 255.0f; + dotValue = dot(currColor, dir); + + ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]); + if (ditherDot < halfPoint) + step = (ditherDot < c0Point) ? 1u : 3u; + else + step = (ditherDot < c3Point) ? 2u : 0u; + ep1[1] = dotValue - stops[step]; + lmask |= step << 2u; + + currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f; + dotValue = dot(currColor, dir); + + ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]); + if (ditherDot < halfPoint) + step = (ditherDot < c0Point) ? 1u : 3u; + else + step = (ditherDot < c3Point) ? 2u : 0u; + ep1[2] = dotValue - stops[step]; + lmask |= step << 4u; + + currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f; + dotValue = dot(currColor, dir); + + ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]); + if (ditherDot < halfPoint) + step = (ditherDot < c0Point) ? 1u : 3u; + else + step = (ditherDot < c3Point) ? 2u : 0u; + ep1[3] = dotValue - stops[step]; + lmask |= step << 6u; + + mask |= lmask << (y * 8u); + { + float4 tmp = ep1; + ep1 = ep2; + ep2 = tmp; + } // swap + } +#endif + + return mask; +} + +// The refinement function. (Clever code, part 2) +// Tries to optimize colors to suit block contents better. +// (By solving a least squares system via normal equations+Cramer's rule) +bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16, + inout float inOutMaxEndp16) { + float newMin16, newMax16; + const float oldMin = inOutMinEndp16; + const float oldMax = inOutMaxEndp16; + + if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index? + { + // yes, linear system would be singular; solve using optimal + // single-color match on average color + float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f); + for (int i = 0; i < 16; ++i) + rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz; + + rgbVal = floor(rgbVal * (255.0f / 16.0f)); + + newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + // + c_oMatch6[uint(rgbVal.g)][0] * 32.0f + // + c_oMatch5[uint(rgbVal.b)][0]; + newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + // + c_oMatch6[uint(rgbVal.g)][1] * 32.0f + // + c_oMatch5[uint(rgbVal.b)][1]; + } else { + const float w1Tab[4] = { 3, 0, 2, 1 }; + const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f }; + // ^some magic to save a lot of multiplies in the accumulating loop... + // (precomputed products of weights for least squares system, accumulated inside one 32-bit + // register) + + float akku = 0.0f; + uint cm = mask; + float3 at1 = float3(0, 0, 0); + float3 at2 = float3(0, 0, 0); + for (int i = 0; i < 16; ++i, cm >>= 2u) { + const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f; + + const uint step = cm & 3u; + const float w1 = w1Tab[step]; + akku += prods[step]; + at1 += currColor * w1; + at2 += currColor; + } + + at2 = 3.0f * at2 - at1; + + // extract solutions and decide solvability + const float xx = floor(akku / 65535.0f); + const float yy = floor(mod(akku, 65535.0f) / 256.0f); + const float xy = mod(akku, 256.0f); + + float2 f_rb_g; + f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy); + f_rb_g.y = f_rb_g.x * 63.0f / 31.0f; + + // solve. + const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f), + float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31)); + newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z; + + const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f), + float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31)); + newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z; + } + + inOutMinEndp16 = newMin16; + inOutMaxEndp16 = newMax16; + + return oldMin != newMin16 || oldMax != newMax16; +} + +#ifdef BC1_DITHER +/// Quantizes 'srcValue' which is originally in 888 (full range), +/// converting it to 565 and then back to 888 (quantized) +float3 quant(float3 srcValue) { + srcValue = clamp(srcValue, 0.0f, 255.0f); + // Convert 888 -> 565 + srcValue = floor(srcValue * float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f) + 0.5f); + // Convert 565 -> 888 back + srcValue = floor(srcValue * float3(8.25f, 4.0625f, 8.25f)); + + return srcValue; +} + +void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) { + float3 ep1[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) }; + float3 ep2[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) }; + + for (uint y = 0u; y < 16u; y += 4u) { + float3 srcPixel, dithPixel; + + srcPixel = unpackUnorm4x8(srcPixBlck[y + 0u]).xyz * 255.0f; + dithPixel = quant(srcPixel + trunc((3 * ep2[1] + 5 * ep2[0]) * (1.0f / 16.0f))); + ep1[0] = srcPixel - dithPixel; + dthPixBlck[y + 0u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f)); + + srcPixel = unpackUnorm4x8(srcPixBlck[y + 1u]).xyz * 255.0f; + dithPixel = quant( + srcPixel + trunc((7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]) * (1.0f / 16.0f))); + ep1[1] = srcPixel - dithPixel; + dthPixBlck[y + 1u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f)); + + srcPixel = unpackUnorm4x8(srcPixBlck[y + 2u]).xyz * 255.0f; + dithPixel = quant( + srcPixel + trunc((7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]) * (1.0f / 16.0f))); + ep1[2] = srcPixel - dithPixel; + dthPixBlck[y + 2u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f)); + + srcPixel = unpackUnorm4x8(srcPixBlck[y + 3u]).xyz * 255.0f; + dithPixel = quant(srcPixel + trunc((7 * ep1[2] + 5 * ep2[3] + ep2[2]) * (1.0f / 16.0f))); + ep1[3] = srcPixel - dithPixel; + dthPixBlck[y + 3u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f)); + + // swap( ep1, ep2 ) + for (uint i = 0u; i < 4u; ++i) { + float3 tmp = ep1[i]; + ep1[i] = ep2[i]; + ep2[i] = tmp; + } + } +} +#endif + +void main() { + uint srcPixelsBlock[16]; + + bool bAllColorsEqual = true; + + // Load the whole 4x4 block + const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u; + for (uint i = 0u; i < 16u; ++i) { + const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u); + const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz; + srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f)); + bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i]; + } + + float maxEndp16, minEndp16; + uint mask = 0u; + + if (bAllColorsEqual) { + const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f); + mask = 0xAAAAAAAAu; + maxEndp16 = + c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0]; + minEndp16 = + c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1]; + } else { +#ifdef BC1_DITHER + uint ditherPixelsBlock[16]; + // first step: compute dithered version for PCA if desired + DitherBlock(srcPixelsBlock, ditherPixelsBlock); +#else +#define ditherPixelsBlock srcPixelsBlock +#endif + + // second step: pca+map along principal axis + OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16); + if (minEndp16 != maxEndp16) { + float3 colors[4]; + EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted + mask = MatchColorsBlock(srcPixelsBlock, colors); + } + + // third step: refine (multiple times if requested) + bool bStopRefinement = false; + for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) { + const uint lastMask = mask; + + if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) { + if (minEndp16 != maxEndp16) { + float3 colors[4]; + EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted + mask = MatchColorsBlock(srcPixelsBlock, colors); + } else { + mask = 0u; + bStopRefinement = true; + } + } + + bStopRefinement = mask == lastMask || bStopRefinement; + } + } + + // write the color block + if (maxEndp16 < minEndp16) { + const float tmpValue = minEndp16; + minEndp16 = maxEndp16; + maxEndp16 = tmpValue; + mask ^= 0x55555555u; + } + + uint2 outputBytes; + outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u); + outputBytes.y = mask; + + uint2 dstUV = gl_GlobalInvocationID.xy; + imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u)); +} diff --git a/modules/betsy/betsy_bc1.h b/modules/betsy/betsy_bc1.h new file mode 100644 index 00000000000..2274ed0a81e --- /dev/null +++ b/modules/betsy/betsy_bc1.h @@ -0,0 +1,1061 @@ +/**************************************************************************/ +/* betsy_bc1.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#ifndef BETSY_BC1_H +#define BETSY_BC1_H + +constexpr const float dxt1_encoding_table[1024] = { + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 1, + 0, + 1, + 1, + 1, + 1, + 2, + 0, + 2, + 0, + 0, + 4, + 2, + 1, + 2, + 1, + 2, + 1, + 3, + 0, + 3, + 0, + 3, + 0, + 3, + 1, + 1, + 5, + 3, + 2, + 3, + 2, + 4, + 0, + 4, + 0, + 4, + 1, + 4, + 1, + 4, + 2, + 4, + 2, + 4, + 2, + 3, + 5, + 5, + 1, + 5, + 1, + 5, + 2, + 4, + 4, + 5, + 3, + 5, + 3, + 5, + 3, + 6, + 2, + 6, + 2, + 6, + 2, + 6, + 3, + 5, + 5, + 6, + 4, + 6, + 4, + 4, + 8, + 7, + 3, + 7, + 3, + 7, + 3, + 7, + 4, + 7, + 4, + 7, + 4, + 7, + 5, + 5, + 9, + 7, + 6, + 7, + 6, + 8, + 4, + 8, + 4, + 8, + 5, + 8, + 5, + 8, + 6, + 8, + 6, + 8, + 6, + 7, + 9, + 9, + 5, + 9, + 5, + 9, + 6, + 8, + 8, + 9, + 7, + 9, + 7, + 9, + 7, + 10, + 6, + 10, + 6, + 10, + 6, + 10, + 7, + 9, + 9, + 10, + 8, + 10, + 8, + 8, + 12, + 11, + 7, + 11, + 7, + 11, + 7, + 11, + 8, + 11, + 8, + 11, + 8, + 11, + 9, + 9, + 13, + 11, + 10, + 11, + 10, + 12, + 8, + 12, + 8, + 12, + 9, + 12, + 9, + 12, + 10, + 12, + 10, + 12, + 10, + 11, + 13, + 13, + 9, + 13, + 9, + 13, + 10, + 12, + 12, + 13, + 11, + 13, + 11, + 13, + 11, + 14, + 10, + 14, + 10, + 14, + 10, + 14, + 11, + 13, + 13, + 14, + 12, + 14, + 12, + 12, + 16, + 15, + 11, + 15, + 11, + 15, + 11, + 15, + 12, + 15, + 12, + 15, + 12, + 15, + 13, + 13, + 17, + 15, + 14, + 15, + 14, + 16, + 12, + 16, + 12, + 16, + 13, + 16, + 13, + 16, + 14, + 16, + 14, + 16, + 14, + 15, + 17, + 17, + 13, + 17, + 13, + 17, + 14, + 16, + 16, + 17, + 15, + 17, + 15, + 17, + 15, + 18, + 14, + 18, + 14, + 18, + 14, + 18, + 15, + 17, + 17, + 18, + 16, + 18, + 16, + 16, + 20, + 19, + 15, + 19, + 15, + 19, + 15, + 19, + 16, + 19, + 16, + 19, + 16, + 19, + 17, + 17, + 21, + 19, + 18, + 19, + 18, + 20, + 16, + 20, + 16, + 20, + 17, + 20, + 17, + 20, + 18, + 20, + 18, + 20, + 18, + 19, + 21, + 21, + 17, + 21, + 17, + 21, + 18, + 20, + 20, + 21, + 19, + 21, + 19, + 21, + 19, + 22, + 18, + 22, + 18, + 22, + 18, + 22, + 19, + 21, + 21, + 22, + 20, + 22, + 20, + 20, + 24, + 23, + 19, + 23, + 19, + 23, + 19, + 23, + 20, + 23, + 20, + 23, + 20, + 23, + 21, + 21, + 25, + 23, + 22, + 23, + 22, + 24, + 20, + 24, + 20, + 24, + 21, + 24, + 21, + 24, + 22, + 24, + 22, + 24, + 22, + 23, + 25, + 25, + 21, + 25, + 21, + 25, + 22, + 24, + 24, + 25, + 23, + 25, + 23, + 25, + 23, + 26, + 22, + 26, + 22, + 26, + 22, + 26, + 23, + 25, + 25, + 26, + 24, + 26, + 24, + 24, + 28, + 27, + 23, + 27, + 23, + 27, + 23, + 27, + 24, + 27, + 24, + 27, + 24, + 27, + 25, + 25, + 29, + 27, + 26, + 27, + 26, + 28, + 24, + 28, + 24, + 28, + 25, + 28, + 25, + 28, + 26, + 28, + 26, + 28, + 26, + 27, + 29, + 29, + 25, + 29, + 25, + 29, + 26, + 28, + 28, + 29, + 27, + 29, + 27, + 29, + 27, + 30, + 26, + 30, + 26, + 30, + 26, + 30, + 27, + 29, + 29, + 30, + 28, + 30, + 28, + 30, + 28, + 31, + 27, + 31, + 27, + 31, + 27, + 31, + 28, + 31, + 28, + 31, + 28, + 31, + 29, + 31, + 29, + 31, + 30, + 31, + 30, + 31, + 30, + 31, + 31, + 31, + 31, + 0, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 1, + 1, + 2, + 0, + 2, + 1, + 3, + 0, + 3, + 0, + 3, + 1, + 4, + 0, + 4, + 0, + 4, + 1, + 5, + 0, + 5, + 1, + 6, + 0, + 6, + 0, + 6, + 1, + 7, + 0, + 7, + 0, + 7, + 1, + 8, + 0, + 8, + 1, + 8, + 1, + 8, + 2, + 9, + 1, + 9, + 2, + 9, + 2, + 9, + 3, + 10, + 2, + 10, + 3, + 10, + 3, + 10, + 4, + 11, + 3, + 11, + 4, + 11, + 4, + 11, + 5, + 12, + 4, + 12, + 5, + 12, + 5, + 12, + 6, + 13, + 5, + 13, + 6, + 8, + 16, + 13, + 7, + 14, + 6, + 14, + 7, + 9, + 17, + 14, + 8, + 15, + 7, + 15, + 8, + 11, + 16, + 15, + 9, + 15, + 10, + 16, + 8, + 16, + 9, + 16, + 10, + 15, + 13, + 17, + 9, + 17, + 10, + 17, + 11, + 15, + 16, + 18, + 10, + 18, + 11, + 18, + 12, + 16, + 16, + 19, + 11, + 19, + 12, + 19, + 13, + 17, + 17, + 20, + 12, + 20, + 13, + 20, + 14, + 19, + 16, + 21, + 13, + 21, + 14, + 21, + 15, + 20, + 17, + 22, + 14, + 22, + 15, + 25, + 10, + 22, + 16, + 23, + 15, + 23, + 16, + 26, + 11, + 23, + 17, + 24, + 16, + 24, + 17, + 27, + 12, + 24, + 18, + 25, + 17, + 25, + 18, + 28, + 13, + 25, + 19, + 26, + 18, + 26, + 19, + 29, + 14, + 26, + 20, + 27, + 19, + 27, + 20, + 30, + 15, + 27, + 21, + 28, + 20, + 28, + 21, + 28, + 21, + 28, + 22, + 29, + 21, + 29, + 22, + 24, + 32, + 29, + 23, + 30, + 22, + 30, + 23, + 25, + 33, + 30, + 24, + 31, + 23, + 31, + 24, + 27, + 32, + 31, + 25, + 31, + 26, + 32, + 24, + 32, + 25, + 32, + 26, + 31, + 29, + 33, + 25, + 33, + 26, + 33, + 27, + 31, + 32, + 34, + 26, + 34, + 27, + 34, + 28, + 32, + 32, + 35, + 27, + 35, + 28, + 35, + 29, + 33, + 33, + 36, + 28, + 36, + 29, + 36, + 30, + 35, + 32, + 37, + 29, + 37, + 30, + 37, + 31, + 36, + 33, + 38, + 30, + 38, + 31, + 41, + 26, + 38, + 32, + 39, + 31, + 39, + 32, + 42, + 27, + 39, + 33, + 40, + 32, + 40, + 33, + 43, + 28, + 40, + 34, + 41, + 33, + 41, + 34, + 44, + 29, + 41, + 35, + 42, + 34, + 42, + 35, + 45, + 30, + 42, + 36, + 43, + 35, + 43, + 36, + 46, + 31, + 43, + 37, + 44, + 36, + 44, + 37, + 44, + 37, + 44, + 38, + 45, + 37, + 45, + 38, + 40, + 48, + 45, + 39, + 46, + 38, + 46, + 39, + 41, + 49, + 46, + 40, + 47, + 39, + 47, + 40, + 43, + 48, + 47, + 41, + 47, + 42, + 48, + 40, + 48, + 41, + 48, + 42, + 47, + 45, + 49, + 41, + 49, + 42, + 49, + 43, + 47, + 48, + 50, + 42, + 50, + 43, + 50, + 44, + 48, + 48, + 51, + 43, + 51, + 44, + 51, + 45, + 49, + 49, + 52, + 44, + 52, + 45, + 52, + 46, + 51, + 48, + 53, + 45, + 53, + 46, + 53, + 47, + 52, + 49, + 54, + 46, + 54, + 47, + 57, + 42, + 54, + 48, + 55, + 47, + 55, + 48, + 58, + 43, + 55, + 49, + 56, + 48, + 56, + 49, + 59, + 44, + 56, + 50, + 57, + 49, + 57, + 50, + 60, + 45, + 57, + 51, + 58, + 50, + 58, + 51, + 61, + 46, + 58, + 52, + 59, + 51, + 59, + 52, + 62, + 47, + 59, + 53, + 60, + 52, + 60, + 53, + 60, + 53, + 60, + 54, + 61, + 53, + 61, + 54, + 61, + 54, + 61, + 55, + 62, + 54, + 62, + 55, + 62, + 55, + 62, + 56, + 63, + 55, + 63, + 56, + 63, + 56, + 63, + 57, + 63, + 58, + 63, + 59, + 63, + 59, + 63, + 60, + 63, + 61, + 63, + 62, + 63, + 62, + 63, + 63, +}; + +#endif // BETSY_BC1_H diff --git a/modules/betsy/image_compress_betsy.cpp b/modules/betsy/image_compress_betsy.cpp index bc72203b2ff..07298d82fb5 100644 --- a/modules/betsy/image_compress_betsy.cpp +++ b/modules/betsy/image_compress_betsy.cpp @@ -30,6 +30,7 @@ #include "image_compress_betsy.h" +#include "core/config/project_settings.h" #include "servers/rendering/rendering_device_binds.h" #include "servers/rendering/rendering_server_default.h" @@ -40,113 +41,198 @@ #include "drivers/metal/rendering_context_driver_metal.h" #endif +#include "betsy_bc1.h" + +#include "bc1.glsl.gen.h" #include "bc6h.glsl.gen.h" -struct BC6PushConstant { - float sizeX; - float sizeY; - uint32_t padding[2]; -}; +// Static variables (for caching). + +static RenderingDevice *compress_rd = nullptr; +static RenderingContextDriver *compress_rcd = nullptr; + +static Mutex rd_mutex; +static Mutex shader_mutex; + +static HashMap> cached_shaders; + +// Betsy shader (for caching). + +BetsyShader::BetsyShader() { +} + +BetsyShader::~BetsyShader() { + // Free just the shader, the pipelines will be cleared automatically. + if (compress_rd && compiled.is_valid()) { + compress_rd->free(compiled); + } +} + +// Helper functions. static int get_next_multiple(int n, int m) { return n + (m - (n % m)); } -Error _compress_betsy(BetsyFormat p_format, Image *r_img) { +static String get_shader_name(BetsyFormat p_format) { + switch (p_format) { + case BETSY_FORMAT_BC1: + case BETSY_FORMAT_BC1_DITHER: + return "BC1"; + + case BETSY_FORMAT_BC3: + return "BC3"; + + case BETSY_FORMAT_BC6_SIGNED: + case BETSY_FORMAT_BC6_UNSIGNED: + return "BC6"; + + default: + return ""; + } +} + +Error compress_betsy(BetsyFormat p_format, Image *r_img) { uint64_t start_time = OS::get_singleton()->get_ticks_msec(); if (r_img->is_compressed()) { return ERR_INVALID_DATA; } - ERR_FAIL_COND_V_MSG(r_img->get_format() < Image::FORMAT_RF || r_img->get_format() > Image::FORMAT_RGBE9995, ERR_INVALID_DATA, "Image is not an HDR image."); - Error err = OK; - // Create local RD. - RenderingContextDriver *rcd = nullptr; - RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device(); + rd_mutex.lock(); + if (!compress_rd) { + // Create local RD. + RenderingContextDriver *rcd = nullptr; + RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device(); - if (rd == nullptr) { + if (rd == nullptr) { #if defined(RD_ENABLED) #if defined(METAL_ENABLED) - rcd = memnew(RenderingContextDriverMetal); - rd = memnew(RenderingDevice); + rcd = memnew(RenderingContextDriverMetal); + rd = memnew(RenderingDevice); #endif #if defined(VULKAN_ENABLED) - if (rcd == nullptr) { - rcd = memnew(RenderingContextDriverVulkan); - rd = memnew(RenderingDevice); - } -#endif -#endif - if (rcd != nullptr && rd != nullptr) { - err = rcd->initialize(); - if (err == OK) { - err = rd->initialize(rcd); + if (rcd == nullptr) { + rcd = memnew(RenderingContextDriverVulkan); + rd = memnew(RenderingDevice); } +#endif +#endif + if (rcd != nullptr && rd != nullptr) { + err = rcd->initialize(); + if (err == OK) { + err = rd->initialize(rcd); + } - if (err != OK) { - memdelete(rd); - memdelete(rcd); - rd = nullptr; - rcd = nullptr; + if (err != OK) { + memdelete(rd); + memdelete(rcd); + rd = nullptr; + rcd = nullptr; + } } } + + ERR_FAIL_NULL_V_MSG(rd, err, "Unable to create a local RenderingDevice."); + + compress_rd = rd; + compress_rcd = rcd; } - - ERR_FAIL_NULL_V_MSG(rd, err, "Unable to create a local RenderingDevice."); - - Ref compute_shader; - compute_shader.instantiate(); + rd_mutex.unlock(); // Destination format. Image::Format dest_format = Image::FORMAT_MAX; + RD::DataFormat dst_rd_format = RD::DATA_FORMAT_MAX; String version = ""; switch (p_format) { - case BETSY_FORMAT_BC6: { - err = compute_shader->parse_versions_from_text(bc6h_shader_glsl); + case BETSY_FORMAT_BC1: + version = "standard"; + dst_rd_format = RD::DATA_FORMAT_R32G32_UINT; + dest_format = Image::FORMAT_DXT1; + break; - if (r_img->detect_signed(true)) { - dest_format = Image::FORMAT_BPTC_RGBF; - version = "signed"; - } else { - dest_format = Image::FORMAT_BPTC_RGBFU; - version = "unsigned"; - } + case BETSY_FORMAT_BC1_DITHER: + version = "dithered"; + dst_rd_format = RD::DATA_FORMAT_R32G32_UINT; + dest_format = Image::FORMAT_DXT1; + break; - } break; + case BETSY_FORMAT_BC6_SIGNED: + version = "signed"; + dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT; + dest_format = Image::FORMAT_BPTC_RGBF; + break; + + case BETSY_FORMAT_BC6_UNSIGNED: + version = "unsigned"; + dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT; + dest_format = Image::FORMAT_BPTC_RGBFU; + break; default: err = ERR_INVALID_PARAMETER; break; } - if (err != OK) { - compute_shader->print_errors("Betsy compress shader"); - memdelete(rd); - if (rcd != nullptr) { - memdelete(rcd); + const String shader_name = get_shader_name(p_format) + "-" + version; + const BetsyShader *shader_ptr; + + shader_mutex.lock(); + if (cached_shaders.has(shader_name)) { + shader_ptr = cached_shaders[shader_name].ptr(); + + } else { + Ref shader; + shader.instantiate(); + + Ref source; + source.instantiate(); + + switch (p_format) { + case BETSY_FORMAT_BC1: + case BETSY_FORMAT_BC1_DITHER: + err = source->parse_versions_from_text(bc1_shader_glsl); + break; + + case BETSY_FORMAT_BC6_UNSIGNED: + case BETSY_FORMAT_BC6_SIGNED: + err = source->parse_versions_from_text(bc6h_shader_glsl); + break; + + default: + err = ERR_INVALID_PARAMETER; + break; } - return err; - } - - // Compile the shader, return early if invalid. - RID shader = rd->shader_create_from_spirv(compute_shader->get_spirv_stages(version)); - - if (shader.is_null()) { - memdelete(rd); - if (rcd != nullptr) { - memdelete(rcd); + if (err != OK) { + source->print_errors("Betsy compress shader"); + return err; } - return err; - } + // Compile the shader, return early if invalid. + shader->compiled = compress_rd->shader_create_from_spirv(source->get_spirv_stages(version)); + if (shader->compiled.is_null()) { + return ERR_CANT_CREATE; + } - RID pipeline = rd->compute_pipeline_create(shader); + // Compile the pipeline, return early if invalid. + shader->pipeline = compress_rd->compute_pipeline_create(shader->compiled); + if (shader->pipeline.is_null()) { + return ERR_CANT_CREATE; + } + + cached_shaders[shader_name] = shader; + shader_ptr = cached_shaders[shader_name].ptr(); + } + shader_mutex.unlock(); + + if (shader_ptr->compiled.is_null() || shader_ptr->pipeline.is_null()) { + return ERR_INVALID_DATA; + } // src_texture format information. RD::TextureFormat src_texture_format; @@ -159,6 +245,33 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { } switch (r_img->get_format()) { + case Image::FORMAT_L8: + r_img->convert(Image::FORMAT_RGBA8); + src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM; + break; + + case Image::FORMAT_LA8: + r_img->convert(Image::FORMAT_RGBA8); + src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM; + break; + + case Image::FORMAT_R8: + src_texture_format.format = RD::DATA_FORMAT_R8_UNORM; + break; + + case Image::FORMAT_RG8: + src_texture_format.format = RD::DATA_FORMAT_R8G8_UNORM; + break; + + case Image::FORMAT_RGB8: + r_img->convert(Image::FORMAT_RGBA8); + src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM; + break; + + case Image::FORMAT_RGBA8: + src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM; + break; + case Image::FORMAT_RH: src_texture_format.format = RD::DATA_FORMAT_R16_SFLOAT; break; @@ -198,13 +311,6 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { break; default: { - rd->free(shader); - - memdelete(rd); - if (rcd != nullptr) { - memdelete(rcd); - } - return err; } } @@ -219,12 +325,25 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { src_sampler_state.mip_filter = RD::SAMPLER_FILTER_NEAREST; } - RID src_sampler = rd->sampler_create(src_sampler_state); + RID src_sampler = compress_rd->sampler_create(src_sampler_state); // For the destination format just copy the source format and change the usage bits. RD::TextureFormat dst_texture_format = src_texture_format; dst_texture_format.usage_bits = RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT; - dst_texture_format.format = RD::DATA_FORMAT_R32G32B32A32_UINT; + dst_texture_format.format = dst_rd_format; + + RID encoding_table_buffer; + bool uses_encoding_table = false; + + // Encoding table setup. + if (dest_format == Image::FORMAT_DXT1) { + Vector data; + data.resize(1024 * 4); + memcpy(data.ptrw(), dxt1_encoding_table, 1024 * 4); + + encoding_table_buffer = compress_rd->storage_buffer_create(1024 * 4, data); + uses_encoding_table = true; + } const int mip_count = r_img->get_mipmap_count() + 1; @@ -256,8 +375,41 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { memcpy(src_image_ptr[0].ptrw(), r_img->ptr() + ofs, size); // Create the textures on the GPU. - RID src_texture = rd->texture_create(src_texture_format, RD::TextureView(), src_images); - RID dst_texture = rd->texture_create(dst_texture_format, RD::TextureView()); + RID src_texture = compress_rd->texture_create(src_texture_format, RD::TextureView(), src_images); + RID dst_texture = compress_rd->texture_create(dst_texture_format, RD::TextureView()); + + Vector uniforms; + { + { + RD::Uniform u; + u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE; + u.binding = 0; + u.append_id(src_sampler); + u.append_id(src_texture); + uniforms.push_back(u); + } + { + RD::Uniform u; + u.uniform_type = RD::UNIFORM_TYPE_IMAGE; + u.binding = 1; + u.append_id(dst_texture); + uniforms.push_back(u); + } + + if (uses_encoding_table) { + RD::Uniform u; + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; + u.binding = 2; + u.append_id(encoding_table_buffer); + uniforms.push_back(u); + } + } + + RID uniform_set = compress_rd->uniform_set_create(uniforms, shader_ptr->compiled, 0); + RD::ComputeListID compute_list = compress_rd->compute_list_begin(); + + compress_rd->compute_list_bind_compute_pipeline(compute_list, shader_ptr->pipeline); + compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0); if (dest_format == Image::FORMAT_BPTC_RGBFU || dest_format == Image::FORMAT_BPTC_RGBF) { BC6PushConstant push_constant; @@ -266,47 +418,33 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { push_constant.padding[0] = 0; push_constant.padding[1] = 0; - Vector uniforms; - { - { - RD::Uniform u; - u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE; - u.binding = 0; - u.append_id(src_sampler); - u.append_id(src_texture); - uniforms.push_back(u); - } - { - RD::Uniform u; - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; - u.binding = 1; - u.append_id(dst_texture); - uniforms.push_back(u); - } - } + compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant)); - RID uniform_set = rd->uniform_set_create(uniforms, shader, 0); - RD::ComputeListID compute_list = rd->compute_list_begin(); + } else { + BC1PushConstant push_constant; + push_constant.num_refines = 2; + push_constant.padding[0] = 0; + push_constant.padding[1] = 0; + push_constant.padding[2] = 0; - rd->compute_list_bind_compute_pipeline(compute_list, pipeline); - rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0); - rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant)); - rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1); - rd->compute_list_end(); + compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant)); } - rd->submit(); - rd->sync(); + compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1); + compress_rd->compute_list_end(); + + compress_rd->submit(); + compress_rd->sync(); // Copy data from the GPU to the buffer. - const Vector texture_data = rd->texture_get_data(dst_texture, 0); + const Vector texture_data = compress_rd->texture_get_data(dst_texture, 0); int64_t dst_ofs = Image::get_image_mipmap_offset(r_img->get_width(), r_img->get_height(), dest_format, i); memcpy(dst_data_ptr + dst_ofs, texture_data.ptr(), texture_data.size()); // Free the source and dest texture. - rd->free(dst_texture); - rd->free(src_texture); + compress_rd->free(dst_texture); + compress_rd->free(src_texture); } src_images.clear(); @@ -315,14 +453,11 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { r_img->set_data(r_img->get_width(), r_img->get_height(), r_img->has_mipmaps(), dest_format, dst_data); // Free the shader (dependencies will be cleared automatically). - rd->free(src_sampler); - rd->free(shader); - - memdelete(rd); - if (rcd != nullptr) { - memdelete(rcd); + if (uses_encoding_table) { + compress_rd->free(encoding_table_buffer); } + compress_rd->free(src_sampler); print_verbose(vformat("Betsy: Encoding took %d ms.", OS::get_singleton()->get_ticks_msec() - start_time)); return OK; @@ -330,10 +465,61 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) { Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels) { Image::Format format = r_img->get_format(); + Error result = ERR_UNAVAILABLE; if (format >= Image::FORMAT_RF && format <= Image::FORMAT_RGBE9995) { - return _compress_betsy(BETSY_FORMAT_BC6, r_img); + if (r_img->detect_signed()) { + result = compress_betsy(BETSY_FORMAT_BC6_SIGNED, r_img); + } else { + result = compress_betsy(BETSY_FORMAT_BC6_UNSIGNED, r_img); + } } - return ERR_UNAVAILABLE; + if (!GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor")) { + free_device(); + } + + return result; +} + +Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) { + Error result = ERR_UNAVAILABLE; + + switch (p_channels) { + case Image::USED_CHANNELS_RGB: + result = compress_betsy(BETSY_FORMAT_BC1_DITHER, r_img); + break; + + case Image::USED_CHANNELS_L: + result = compress_betsy(BETSY_FORMAT_BC1, r_img); + break; + + default: + break; + } + + if (!GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor")) { + free_device(); + } + + return result; +} + +void free_device() { + if (compress_rd != nullptr) { + // Clear the shader cache, shaders will be unreferenced automatically. + shader_mutex.lock(); + cached_shaders.clear(); + shader_mutex.unlock(); + + // Free the RD (and RCD if necessary). + rd_mutex.lock(); + memdelete(compress_rd); + compress_rd = nullptr; + if (compress_rcd != nullptr) { + memdelete(compress_rcd); + compress_rcd = nullptr; + } + rd_mutex.unlock(); + } } diff --git a/modules/betsy/image_compress_betsy.h b/modules/betsy/image_compress_betsy.h index a64e586c76c..522b09160a6 100644 --- a/modules/betsy/image_compress_betsy.h +++ b/modules/betsy/image_compress_betsy.h @@ -34,11 +34,38 @@ #include "core/io/image.h" enum BetsyFormat { - BETSY_FORMAT_BC6, + BETSY_FORMAT_BC1, + BETSY_FORMAT_BC1_DITHER, + BETSY_FORMAT_BC3, + BETSY_FORMAT_BC6_SIGNED, + BETSY_FORMAT_BC6_UNSIGNED, }; -Error _compress_betsy(BetsyFormat p_format, Image *r_img); +class BetsyShader : public RefCounted { +public: + RID compiled; + RID pipeline; + + BetsyShader(); + ~BetsyShader(); +}; + +struct BC6PushConstant { + float sizeX; + float sizeY; + uint32_t padding[2]; +}; + +struct BC1PushConstant { + uint32_t num_refines; + uint32_t padding[3]; +}; + +void free_device(); + +Error compress_betsy(BetsyFormat p_format, Image *r_img); Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels); +Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels); #endif // IMAGE_COMPRESS_BETSY_H diff --git a/modules/betsy/register_types.cpp b/modules/betsy/register_types.cpp index 019099e67c6..a3a3b5a99bf 100644 --- a/modules/betsy/register_types.cpp +++ b/modules/betsy/register_types.cpp @@ -38,10 +38,13 @@ void initialize_betsy_module(ModuleInitializationLevel p_level) { } Image::_image_compress_bptc_rd_func = _betsy_compress_bptc; + Image::_image_compress_bc_rd_func = _betsy_compress_s3tc; } void uninitialize_betsy_module(ModuleInitializationLevel p_level) { if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) { return; } + + free_device(); } diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp index f354e83893b..020fa94e9dc 100644 --- a/servers/rendering_server.cpp +++ b/servers/rendering_server.cpp @@ -3528,6 +3528,7 @@ void RenderingServer::init() { GLOBAL_DEF_RST("rendering/textures/vram_compression/import_s3tc_bptc", false); GLOBAL_DEF_RST("rendering/textures/vram_compression/import_etc2_astc", false); GLOBAL_DEF("rendering/textures/vram_compression/compress_with_gpu", true); + GLOBAL_DEF("rendering/textures/vram_compression/cache_gpu_compressor", true); GLOBAL_DEF("rendering/textures/lossless_compression/force_png", false); diff --git a/thirdparty/README.md b/thirdparty/README.md index dbe20ba2a58..ca6be902e32 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -78,7 +78,7 @@ fix build with our own copy of zstd (patch in `patches`). Files extracted from upstream source: -- `bc6h.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`. +- `bc6h.glsl`, `bc1.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`. - `LICENSE.md`