From 606eedb0c9093789018f64381e79e7d93dd8752d Mon Sep 17 00:00:00 2001
From: BlueCube3310 <53150244+BlueCube3310@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:14:48 +0200
Subject: [PATCH] Betsy: Add caching and BC1 compression support
---
core/io/image.cpp | 14 +
core/io/image.h | 1 +
doc/classes/ProjectSettings.xml | 7 +-
modules/betsy/SCsub | 1 +
modules/betsy/bc1.glsl | 483 +++++++++++
modules/betsy/betsy_bc1.h | 1061 ++++++++++++++++++++++++
modules/betsy/image_compress_betsy.cpp | 408 ++++++---
modules/betsy/image_compress_betsy.h | 31 +-
modules/betsy/register_types.cpp | 3 +
servers/rendering_server.cpp | 1 +
thirdparty/README.md | 2 +-
11 files changed, 1896 insertions(+), 116 deletions(-)
create mode 100644 modules/betsy/bc1.glsl
create mode 100644 modules/betsy/betsy_bc1.h
diff --git a/core/io/image.cpp b/core/io/image.cpp
index fcbe483e381..bc018bd45c6 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -2751,6 +2751,19 @@ Error Image::compress_from_channels(CompressMode p_mode, UsedChannels p_channels
} break;
+ case COMPRESS_S3TC: {
+ // BC3 is unsupported currently.
+ if ((p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) {
+ Error result = _image_compress_bc_rd_func(this, p_channels);
+
+ // If the image was compressed successfully, we return here. If not, we fall back to the default compression scheme.
+ if (result == OK) {
+ return OK;
+ }
+ }
+
+ } break;
+
default: {
}
}
@@ -3138,6 +3151,7 @@ void (*Image::_image_compress_etc1_func)(Image *) = nullptr;
void (*Image::_image_compress_etc2_func)(Image *, Image::UsedChannels) = nullptr;
void (*Image::_image_compress_astc_func)(Image *, Image::ASTCFormat) = nullptr;
Error (*Image::_image_compress_bptc_rd_func)(Image *, Image::UsedChannels) = nullptr;
+Error (*Image::_image_compress_bc_rd_func)(Image *, Image::UsedChannels) = nullptr;
void (*Image::_image_decompress_bc)(Image *) = nullptr;
void (*Image::_image_decompress_bptc)(Image *) = nullptr;
void (*Image::_image_decompress_etc1)(Image *) = nullptr;
diff --git a/core/io/image.h b/core/io/image.h
index 4461ae71a6e..78757246e06 100644
--- a/core/io/image.h
+++ b/core/io/image.h
@@ -160,6 +160,7 @@ public:
static void (*_image_compress_astc_func)(Image *, ASTCFormat p_format);
static Error (*_image_compress_bptc_rd_func)(Image *, UsedChannels p_channels);
+ static Error (*_image_compress_bc_rd_func)(Image *, UsedChannels p_channels);
static void (*_image_decompress_bc)(Image *);
static void (*_image_decompress_bptc)(Image *);
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index e5b787714f5..7b834c166b6 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -2890,10 +2890,13 @@
If [code]true[/code], the texture importer will import lossless textures using the PNG format. Otherwise, it will default to using WebP.
+
+ If [code]true[/code], the GPU texture compressor will cache the local RenderingDevice and its resources (shaders and pipelines), allowing for faster subsequent imports at a memory cost.
+
- If [code]true[/code], the texture importer will utilize the GPU for compressing textures, which makes large textures import significantly faster.
+ If [code]true[/code], the texture importer will utilize the GPU for compressing textures, improving the import time of large images.
[b]Note:[/b] This setting requires either Vulkan or D3D12 available as a rendering backend.
- [b]Note:[/b] Currently this only affects BC6H compression, which is used on Desktop and Console for HDR images.
+ [b]Note:[/b] Currently this only affects BC1 and BC6H compression, which are used on Desktop and Console for fully opaque and HDR images respectively.
If [code]true[/code], the texture importer will import VRAM-compressed textures using the Ericsson Texture Compression 2 algorithm for lower quality textures and normal maps and Adaptable Scalable Texture Compression algorithm for high quality textures (in 4×4 block size).
diff --git a/modules/betsy/SCsub b/modules/betsy/SCsub
index 9930e1f4cf9..ed5dcbf58b7 100644
--- a/modules/betsy/SCsub
+++ b/modules/betsy/SCsub
@@ -4,6 +4,7 @@ Import("env_modules")
env_betsy = env_modules.Clone()
env_betsy.GLSL_HEADER("bc6h.glsl")
+env_betsy.GLSL_HEADER("bc1.glsl")
env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"])
# Thirdparty source files
diff --git a/modules/betsy/bc1.glsl b/modules/betsy/bc1.glsl
new file mode 100644
index 00000000000..f1b2c282547
--- /dev/null
+++ b/modules/betsy/bc1.glsl
@@ -0,0 +1,483 @@
+#[versions]
+
+standard = "";
+dithered = "#define BC1_DITHER";
+
+#[compute]
+#version 450
+
+#include "CrossPlatformSettings_piece_all.glsl"
+#include "UavCrossPlatform_piece_all.glsl"
+
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+
+layout(binding = 0) uniform sampler2D srcTex;
+layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
+
+layout(std430, binding = 2) readonly restrict buffer globalBuffer {
+ float2 c_oMatch5[256];
+ float2 c_oMatch6[256];
+};
+
+layout(push_constant, std430) uniform Params {
+ uint p_numRefinements;
+ uint p_padding[3];
+}
+params;
+
+layout(local_size_x = 8, //
+ local_size_y = 8, //
+ local_size_z = 1) in;
+
+float3 rgb565to888(float rgb565) {
+ float3 retVal;
+ retVal.x = floor(rgb565 / 2048.0f); // bits 11-15 (5-bit field)
+ retVal.y = floor(mod(rgb565, 2048.0f) / 32.0f); // bits 5-10 (6-bit field)
+ retVal.z = floor(mod(rgb565, 32.0f)); // bits 0-4 (5-bit field)
+ // retVal now holds the raw 5/6/5-bit channel values; they are expanded to the 0-255 range below.
+ // This is the correct 565 to 888 conversion:
+ // rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
+ //
+ // However stb_dxt follows a different one:
+ // rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
+ // g = floor( g * ( 256 / 64 + 4 / 64 ) );
+ //
+ // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded
+ // It's quite possible this is the reason:
+ // http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
+ //
+ // Or maybe it's just because it's cheap to do with integer shifts.
+ // Anyway, we follow stb_dxt's conversion just in case
+ // (gives almost the same result, with 1 or -1 of difference for a very few values)
+ //
+ // Perhaps when we make 888 -> 565 -> 888 it doesn't matter
+ // because they end up mapping to the original number
+
+ return floor(retVal * float3(8.25f, 4.0625f, 8.25f));
+}
+
+float rgb888to565(float3 rgbValue) {
+ rgbValue.rb = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f); // round 0-255 to the 0-31 range
+ rgbValue.g = floor(rgbValue.g * 63.0f / 255.0f + 0.5f); // round 0-255 to the 0-63 range
+ // Pack as r * 2^11 + g * 2^5 + b; the packed 16-bit value is carried in a float.
+ return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b;
+}
+
+// Interpolates at the 1/3 point from a towards b, i.e. about (2a + b) / 3; rounding depends on STB_DXT_USE_ROUNDING_BIAS.
+float3 lerp13(float3 a, float3 b) {
+#ifdef STB_DXT_USE_ROUNDING_BIAS
+ // with rounding bias
+ return a + floor((b - a) * (1.0f / 3.0f) + 0.5f);
+#else
+ // without rounding bias
+ return floor((2.0f * a + b) / 3.0f);
+#endif
+}
+
+/// Builds the 4-entry palette for two RGB565 endpoints: both endpoints expanded to 888, plus the two 1/3 blends.
+void EvalColors(out float3 colors[4], float c0, float c1) {
+ colors[0] = rgb565to888(c0);
+ colors[1] = rgb565to888(c1);
+ colors[2] = lerp13(colors[0], colors[1]); // one third of the way from c0 to c1
+ colors[3] = lerp13(colors[1], colors[0]); // one third of the way from c1 to c0
+}
+
+/** The color optimization function. (Clever code, part 1)
+Picks as endpoints the two pixels lying furthest apart along the block's principal color axis.
+@param srcPixelsBlock 16 pixels in unpackUnorm4x8 layout (only .xyz is read here)
+@param outMinEndp16 [out] Pixel with the smallest projection on the axis, in RGB565
+@param outMaxEndp16 [out] Pixel with the largest projection on the axis, in RGB565
+*/
+void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {
+ // determine color distribution
+ float3 avgColor;
+ float3 minColor;
+ float3 maxColor;
+
+ avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
+ for (int i = 1; i < 16; ++i) {
+ const float3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz;
+ avgColor += currColorUnorm;
+ minColor = min(minColor, currColorUnorm);
+ maxColor = max(maxColor, currColorUnorm);
+ }
+ // rescale from the 0-1 range to 0-255 (avgColor is still the sum over the 16 pixels at this point)
+ avgColor = round(avgColor * 255.0f / 16.0f);
+ maxColor *= 255.0f;
+ minColor *= 255.0f;
+
+ // determine covariance matrix (symmetric, so only rr, rg, rb, gg, gb, bb are stored)
+ float cov[6];
+ for (int i = 0; i < 6; ++i)
+ cov[i] = 0;
+
+ for (int i = 0; i < 16; ++i) {
+ const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+ float3 rgbDiff = currColor - avgColor;
+
+ cov[0] += rgbDiff.r * rgbDiff.r;
+ cov[1] += rgbDiff.r * rgbDiff.g;
+ cov[2] += rgbDiff.r * rgbDiff.b;
+ cov[3] += rgbDiff.g * rgbDiff.g;
+ cov[4] += rgbDiff.g * rgbDiff.b;
+ cov[5] += rgbDiff.b * rgbDiff.b;
+ }
+
+ // convert covariance matrix to float, find principal axis via power iter
+ for (int i = 0; i < 6; ++i)
+ cov[i] /= 255.0f;
+
+ float3 vF = maxColor - minColor; // starting vector for the power iteration
+
+ const int nIterPower = 4;
+ for (int iter = 0; iter < nIterPower; ++iter) {
+ const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];
+ const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
+ const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];
+
+ vF.r = r;
+ vF.g = g;
+ vF.b = b;
+ }
+
+ float magn = max3(abs(vF.r), abs(vF.g), abs(vF.b));
+ float3 v;
+
+ if (magn < 4.0f) { // too small, default to luminance
+ v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000.
+ v.g = 587.0f;
+ v.b = 114.0f;
+ } else {
+ v = trunc(vF * (512.0f / magn)); // rescale so the largest component has magnitude 512
+ }
+
+ // Pick colors at extreme points
+ float3 minEndpoint, maxEndpoint;
+ float minDot = FLT_MAX;
+ float maxDot = -FLT_MAX;
+ for (int i = 0; i < 16; ++i) {
+ const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+ const float dotValue = dot(currColor, v);
+
+ if (dotValue < minDot) {
+ minDot = dotValue;
+ minEndpoint = currColor;
+ }
+
+ if (dotValue > maxDot) {
+ maxDot = dotValue;
+ maxEndpoint = currColor;
+ }
+ }
+
+ outMinEndp16 = rgb888to565(minEndpoint);
+ outMaxEndp16 = rgb888to565(maxEndpoint);
+}
+
+// Maps each of the 16 pixels to a palette index; returns 2 bits per pixel, with pixel 0 in the lowest bits.
+uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) {
+ uint mask = 0u;
+ float3 dir = color[0] - color[1];
+ float stops[4];
+
+ for (int i = 0; i < 4; ++i)
+ stops[i] = dot(color[i], dir);
+
+ // think of the colors as arranged on a line; project point onto that line, then choose
+ // next color out of available ones. we compute the crossover points for "best color in top
+ // half"/"best in bottom half" and then the same inside that subinterval.
+ //
+ // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
+ // but it's very close and a lot faster.
+ // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
+
+ float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
+ float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
+ float c3Point = trunc((stops[2] + stops[0]) * 0.5f);
+
+#ifndef BC1_DITHER
+ // the version without dithering is straightforward
+ for (uint i = 16u; i-- > 0u;) {
+ const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+
+ const float dotValue = dot(currColor, dir);
+ mask <<= 2u;
+
+ if (dotValue < halfPoint)
+ mask |= ((dotValue < c0Point) ? 1u : 3u);
+ else
+ mask |= ((dotValue < c3Point) ? 2u : 0u);
+ }
+#else
+ // with floyd-steinberg dithering (ep1: errors of the current row, ep2: errors of the previous row)
+ float4 ep1 = float4(0, 0, 0, 0);
+ float4 ep2 = float4(0, 0, 0, 0);
+
+ c0Point *= 16.0f;
+ halfPoint *= 16.0f;
+ c3Point *= 16.0f;
+
+ for (uint y = 0u; y < 4u; ++y) {
+ float ditherDot;
+ uint lmask, step;
+
+ float3 currColor;
+ float dotValue;
+
+ currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 0]).xyz * 255.0f;
+ dotValue = dot(currColor, dir);
+
+ ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]);
+ if (ditherDot < halfPoint)
+ step = (ditherDot < c0Point) ? 1u : 3u;
+ else
+ step = (ditherDot < c3Point) ? 2u : 0u;
+ ep1[0] = dotValue - stops[step];
+ lmask = step;
+
+ currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 1]).xyz * 255.0f;
+ dotValue = dot(currColor, dir);
+
+ ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);
+ if (ditherDot < halfPoint)
+ step = (ditherDot < c0Point) ? 1u : 3u;
+ else
+ step = (ditherDot < c3Point) ? 2u : 0u;
+ ep1[1] = dotValue - stops[step];
+ lmask |= step << 2u;
+
+ currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f;
+ dotValue = dot(currColor, dir);
+
+ ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);
+ if (ditherDot < halfPoint)
+ step = (ditherDot < c0Point) ? 1u : 3u;
+ else
+ step = (ditherDot < c3Point) ? 2u : 0u;
+ ep1[2] = dotValue - stops[step];
+ lmask |= step << 4u;
+
+ currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 3]).xyz * 255.0f; // fourth pixel of the row
+ dotValue = dot(currColor, dir);
+
+ ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);
+ if (ditherDot < halfPoint)
+ step = (ditherDot < c0Point) ? 1u : 3u;
+ else
+ step = (ditherDot < c3Point) ? 2u : 0u;
+ ep1[3] = dotValue - stops[step];
+ lmask |= step << 6u;
+
+ mask |= lmask << (y * 8u);
+ {
+ float4 tmp = ep1;
+ ep1 = ep2;
+ ep2 = tmp;
+ } // swap
+ }
+#endif
+
+ return mask;
+}
+
+// The refinement function. (Clever code, part 2)
+// Tries to optimize colors to suit block contents better.
+// (By solving a least squares system via normal equations+Cramer's rule)
+bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
+ inout float inOutMaxEndp16) {
+ float newMin16, newMax16;
+ const float oldMin = inOutMinEndp16;
+ const float oldMax = inOutMaxEndp16;
+
+ if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?
+ {
+ // yes, linear system would be singular; solve using optimal
+ // single-color match on average color
+ float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
+ for (int i = 0; i < 16; ++i)
+ rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;
+
+ rgbVal = floor(rgbVal * (255.0f / 16.0f));
+
+ newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //
+ c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //
+ c_oMatch5[uint(rgbVal.b)][0];
+ newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //
+ c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //
+ c_oMatch5[uint(rgbVal.b)][1];
+ } else {
+ const float w1Tab[4] = { 3, 0, 2, 1 };
+ const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
+ // ^some magic to save a lot of multiplies in the accumulating loop...
+ // (precomputed products of weights for least squares system, accumulated inside one 32-bit
+ // register)
+
+ float akku = 0.0f;
+ uint cm = mask;
+ float3 at1 = float3(0, 0, 0);
+ float3 at2 = float3(0, 0, 0);
+ for (int i = 0; i < 16; ++i, cm >>= 2u) {
+ const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+
+ const uint step = cm & 3u;
+ const float w1 = w1Tab[step];
+ akku += prods[step];
+ at1 += currColor * w1;
+ at2 += currColor;
+ }
+
+ at2 = 3.0f * at2 - at1; // at2 now holds sum(color * (3 - w1))
+
+ // unpack the three sums accumulated in akku: xx = akku / 2^16, yy = (akku / 2^8) % 2^8, xy = akku % 2^8
+ const float xx = floor(akku / 65536.0f);
+ const float yy = floor(mod(akku, 65536.0f) / 256.0f);
+ const float xy = mod(akku, 256.0f);
+
+ float2 f_rb_g;
+ f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
+ f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;
+
+ // solve.
+ const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
+ float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
+ newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;
+
+ const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
+ float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
+ newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
+ }
+
+ inOutMinEndp16 = newMin16;
+ inOutMaxEndp16 = newMax16;
+
+ return oldMin != newMin16 || oldMax != newMax16; // true when either endpoint changed
+}
+
+#ifdef BC1_DITHER
+/// Quantizes 'srcValue' which is originally in 888 (full range),
+/// converting it to 565 and then back to 888 (quantized). Input is clamped to 0-255 first.
+float3 quant(float3 srcValue) {
+ srcValue = clamp(srcValue, 0.0f, 255.0f);
+ // Convert 888 -> 565 (rounding to nearest)
+ srcValue = floor(srcValue * float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f) + 0.5f);
+ // Convert 565 -> 888 back (same scale factors as rgb565to888)
+ srcValue = floor(srcValue * float3(8.25f, 4.0625f, 8.25f));
+
+ return srcValue;
+}
+
+void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {
+ float3 ep1[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
+ float3 ep2[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
+ // ep1 collects the per-pixel error (source minus quantized) of the current row, ep2 holds the previous row's.
+ for (uint y = 0u; y < 16u; y += 4u) {
+ float3 srcPixel, dithPixel;
+ // y is the index of the first pixel of the row; the four pixels of the row are handled in order.
+ srcPixel = unpackUnorm4x8(srcPixBlck[y + 0u]).xyz * 255.0f;
+ dithPixel = quant(srcPixel + trunc((3 * ep2[1] + 5 * ep2[0]) * (1.0f / 16.0f)));
+ ep1[0] = srcPixel - dithPixel;
+ dthPixBlck[y + 0u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));
+
+ srcPixel = unpackUnorm4x8(srcPixBlck[y + 1u]).xyz * 255.0f;
+ dithPixel = quant(
+ srcPixel + trunc((7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]) * (1.0f / 16.0f)));
+ ep1[1] = srcPixel - dithPixel;
+ dthPixBlck[y + 1u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));
+
+ srcPixel = unpackUnorm4x8(srcPixBlck[y + 2u]).xyz * 255.0f;
+ dithPixel = quant(
+ srcPixel + trunc((7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]) * (1.0f / 16.0f)));
+ ep1[2] = srcPixel - dithPixel;
+ dthPixBlck[y + 2u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));
+
+ srcPixel = unpackUnorm4x8(srcPixBlck[y + 3u]).xyz * 255.0f;
+ dithPixel = quant(srcPixel + trunc((7 * ep1[2] + 5 * ep2[3] + ep2[2]) * (1.0f / 16.0f)));
+ ep1[3] = srcPixel - dithPixel;
+ dthPixBlck[y + 3u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));
+
+ // swap( ep1, ep2 ) so the errors just computed become the "previous row" for the next iteration
+ for (uint i = 0u; i < 4u; ++i) {
+ float3 tmp = ep1[i];
+ ep1[i] = ep2[i];
+ ep2[i] = tmp;
+ }
+ }
+}
+#endif
+
+void main() {
+ uint srcPixelsBlock[16];
+ // Compresses the 4x4 pixel block at gl_GlobalInvocationID.xy into one 64-bit block stored in dstTexture.
+ bool bAllColorsEqual = true;
+
+ // Load the whole 4x4 block
+ const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u; // each invocation handles one 4x4 block
+ for (uint i = 0u; i < 16u; ++i) {
+ const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);
+ const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;
+ srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));
+ bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
+ }
+
+ float maxEndp16, minEndp16;
+ uint mask = 0u;
+ // A solid-color block takes its endpoints straight from the c_oMatch5/c_oMatch6 tables.
+ if (bAllColorsEqual) {
+ const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f);
+ mask = 0xAAAAAAAAu; // every pixel uses index 2
+ maxEndp16 =
+ c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
+ minEndp16 =
+ c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
+ } else {
+#ifdef BC1_DITHER
+ uint ditherPixelsBlock[16];
+ // first step: compute dithered version for PCA if desired
+ DitherBlock(srcPixelsBlock, ditherPixelsBlock);
+#else
+#define ditherPixelsBlock srcPixelsBlock
+#endif
+
+ // second step: pca+map along principal axis
+ OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
+ if (minEndp16 != maxEndp16) {
+ float3 colors[4];
+ EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
+ mask = MatchColorsBlock(srcPixelsBlock, colors);
+ }
+
+ // third step: refine (multiple times if requested); stops early once the mask no longer changes
+ bool bStopRefinement = false;
+ for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {
+ const uint lastMask = mask;
+
+ if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {
+ if (minEndp16 != maxEndp16) {
+ float3 colors[4];
+ EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
+ mask = MatchColorsBlock(srcPixelsBlock, colors);
+ } else {
+ mask = 0u;
+ bStopRefinement = true;
+ }
+ }
+
+ bStopRefinement = mask == lastMask || bStopRefinement;
+ }
+ }
+
+ // write the color block
+ if (maxEndp16 < minEndp16) {
+ const float tmpValue = minEndp16;
+ minEndp16 = maxEndp16;
+ maxEndp16 = tmpValue;
+ mask ^= 0x55555555u; // flip the low bit of every 2-bit index to follow the swapped endpoints
+ }
+
+ uint2 outputBytes;
+ outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u); // max endpoint in the low 16 bits, min in the high 16
+ outputBytes.y = mask;
+
+ uint2 dstUV = gl_GlobalInvocationID.xy;
+ imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
+}
diff --git a/modules/betsy/betsy_bc1.h b/modules/betsy/betsy_bc1.h
new file mode 100644
index 00000000000..2274ed0a81e
--- /dev/null
+++ b/modules/betsy/betsy_bc1.h
@@ -0,0 +1,1061 @@
+/**************************************************************************/
+/* betsy_bc1.h */
+/**************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* https://godotengine.org */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the */
+/* "Software"), to deal in the Software without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of the Software, and to */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be */
+/* included in all copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/**************************************************************************/
+
+#ifndef BETSY_BC1_H
+#define BETSY_BC1_H
+
+constexpr const float dxt1_encoding_table[1024] = {
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0,
+ 1,
+ 0,
+ 1,
+ 1,
+ 1,
+ 1,
+ 2,
+ 0,
+ 2,
+ 0,
+ 0,
+ 4,
+ 2,
+ 1,
+ 2,
+ 1,
+ 2,
+ 1,
+ 3,
+ 0,
+ 3,
+ 0,
+ 3,
+ 0,
+ 3,
+ 1,
+ 1,
+ 5,
+ 3,
+ 2,
+ 3,
+ 2,
+ 4,
+ 0,
+ 4,
+ 0,
+ 4,
+ 1,
+ 4,
+ 1,
+ 4,
+ 2,
+ 4,
+ 2,
+ 4,
+ 2,
+ 3,
+ 5,
+ 5,
+ 1,
+ 5,
+ 1,
+ 5,
+ 2,
+ 4,
+ 4,
+ 5,
+ 3,
+ 5,
+ 3,
+ 5,
+ 3,
+ 6,
+ 2,
+ 6,
+ 2,
+ 6,
+ 2,
+ 6,
+ 3,
+ 5,
+ 5,
+ 6,
+ 4,
+ 6,
+ 4,
+ 4,
+ 8,
+ 7,
+ 3,
+ 7,
+ 3,
+ 7,
+ 3,
+ 7,
+ 4,
+ 7,
+ 4,
+ 7,
+ 4,
+ 7,
+ 5,
+ 5,
+ 9,
+ 7,
+ 6,
+ 7,
+ 6,
+ 8,
+ 4,
+ 8,
+ 4,
+ 8,
+ 5,
+ 8,
+ 5,
+ 8,
+ 6,
+ 8,
+ 6,
+ 8,
+ 6,
+ 7,
+ 9,
+ 9,
+ 5,
+ 9,
+ 5,
+ 9,
+ 6,
+ 8,
+ 8,
+ 9,
+ 7,
+ 9,
+ 7,
+ 9,
+ 7,
+ 10,
+ 6,
+ 10,
+ 6,
+ 10,
+ 6,
+ 10,
+ 7,
+ 9,
+ 9,
+ 10,
+ 8,
+ 10,
+ 8,
+ 8,
+ 12,
+ 11,
+ 7,
+ 11,
+ 7,
+ 11,
+ 7,
+ 11,
+ 8,
+ 11,
+ 8,
+ 11,
+ 8,
+ 11,
+ 9,
+ 9,
+ 13,
+ 11,
+ 10,
+ 11,
+ 10,
+ 12,
+ 8,
+ 12,
+ 8,
+ 12,
+ 9,
+ 12,
+ 9,
+ 12,
+ 10,
+ 12,
+ 10,
+ 12,
+ 10,
+ 11,
+ 13,
+ 13,
+ 9,
+ 13,
+ 9,
+ 13,
+ 10,
+ 12,
+ 12,
+ 13,
+ 11,
+ 13,
+ 11,
+ 13,
+ 11,
+ 14,
+ 10,
+ 14,
+ 10,
+ 14,
+ 10,
+ 14,
+ 11,
+ 13,
+ 13,
+ 14,
+ 12,
+ 14,
+ 12,
+ 12,
+ 16,
+ 15,
+ 11,
+ 15,
+ 11,
+ 15,
+ 11,
+ 15,
+ 12,
+ 15,
+ 12,
+ 15,
+ 12,
+ 15,
+ 13,
+ 13,
+ 17,
+ 15,
+ 14,
+ 15,
+ 14,
+ 16,
+ 12,
+ 16,
+ 12,
+ 16,
+ 13,
+ 16,
+ 13,
+ 16,
+ 14,
+ 16,
+ 14,
+ 16,
+ 14,
+ 15,
+ 17,
+ 17,
+ 13,
+ 17,
+ 13,
+ 17,
+ 14,
+ 16,
+ 16,
+ 17,
+ 15,
+ 17,
+ 15,
+ 17,
+ 15,
+ 18,
+ 14,
+ 18,
+ 14,
+ 18,
+ 14,
+ 18,
+ 15,
+ 17,
+ 17,
+ 18,
+ 16,
+ 18,
+ 16,
+ 16,
+ 20,
+ 19,
+ 15,
+ 19,
+ 15,
+ 19,
+ 15,
+ 19,
+ 16,
+ 19,
+ 16,
+ 19,
+ 16,
+ 19,
+ 17,
+ 17,
+ 21,
+ 19,
+ 18,
+ 19,
+ 18,
+ 20,
+ 16,
+ 20,
+ 16,
+ 20,
+ 17,
+ 20,
+ 17,
+ 20,
+ 18,
+ 20,
+ 18,
+ 20,
+ 18,
+ 19,
+ 21,
+ 21,
+ 17,
+ 21,
+ 17,
+ 21,
+ 18,
+ 20,
+ 20,
+ 21,
+ 19,
+ 21,
+ 19,
+ 21,
+ 19,
+ 22,
+ 18,
+ 22,
+ 18,
+ 22,
+ 18,
+ 22,
+ 19,
+ 21,
+ 21,
+ 22,
+ 20,
+ 22,
+ 20,
+ 20,
+ 24,
+ 23,
+ 19,
+ 23,
+ 19,
+ 23,
+ 19,
+ 23,
+ 20,
+ 23,
+ 20,
+ 23,
+ 20,
+ 23,
+ 21,
+ 21,
+ 25,
+ 23,
+ 22,
+ 23,
+ 22,
+ 24,
+ 20,
+ 24,
+ 20,
+ 24,
+ 21,
+ 24,
+ 21,
+ 24,
+ 22,
+ 24,
+ 22,
+ 24,
+ 22,
+ 23,
+ 25,
+ 25,
+ 21,
+ 25,
+ 21,
+ 25,
+ 22,
+ 24,
+ 24,
+ 25,
+ 23,
+ 25,
+ 23,
+ 25,
+ 23,
+ 26,
+ 22,
+ 26,
+ 22,
+ 26,
+ 22,
+ 26,
+ 23,
+ 25,
+ 25,
+ 26,
+ 24,
+ 26,
+ 24,
+ 24,
+ 28,
+ 27,
+ 23,
+ 27,
+ 23,
+ 27,
+ 23,
+ 27,
+ 24,
+ 27,
+ 24,
+ 27,
+ 24,
+ 27,
+ 25,
+ 25,
+ 29,
+ 27,
+ 26,
+ 27,
+ 26,
+ 28,
+ 24,
+ 28,
+ 24,
+ 28,
+ 25,
+ 28,
+ 25,
+ 28,
+ 26,
+ 28,
+ 26,
+ 28,
+ 26,
+ 27,
+ 29,
+ 29,
+ 25,
+ 29,
+ 25,
+ 29,
+ 26,
+ 28,
+ 28,
+ 29,
+ 27,
+ 29,
+ 27,
+ 29,
+ 27,
+ 30,
+ 26,
+ 30,
+ 26,
+ 30,
+ 26,
+ 30,
+ 27,
+ 29,
+ 29,
+ 30,
+ 28,
+ 30,
+ 28,
+ 30,
+ 28,
+ 31,
+ 27,
+ 31,
+ 27,
+ 31,
+ 27,
+ 31,
+ 28,
+ 31,
+ 28,
+ 31,
+ 28,
+ 31,
+ 29,
+ 31,
+ 29,
+ 31,
+ 30,
+ 31,
+ 30,
+ 31,
+ 30,
+ 31,
+ 31,
+ 31,
+ 31,
+ 0,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0,
+ 1,
+ 1,
+ 2,
+ 0,
+ 2,
+ 1,
+ 3,
+ 0,
+ 3,
+ 0,
+ 3,
+ 1,
+ 4,
+ 0,
+ 4,
+ 0,
+ 4,
+ 1,
+ 5,
+ 0,
+ 5,
+ 1,
+ 6,
+ 0,
+ 6,
+ 0,
+ 6,
+ 1,
+ 7,
+ 0,
+ 7,
+ 0,
+ 7,
+ 1,
+ 8,
+ 0,
+ 8,
+ 1,
+ 8,
+ 1,
+ 8,
+ 2,
+ 9,
+ 1,
+ 9,
+ 2,
+ 9,
+ 2,
+ 9,
+ 3,
+ 10,
+ 2,
+ 10,
+ 3,
+ 10,
+ 3,
+ 10,
+ 4,
+ 11,
+ 3,
+ 11,
+ 4,
+ 11,
+ 4,
+ 11,
+ 5,
+ 12,
+ 4,
+ 12,
+ 5,
+ 12,
+ 5,
+ 12,
+ 6,
+ 13,
+ 5,
+ 13,
+ 6,
+ 8,
+ 16,
+ 13,
+ 7,
+ 14,
+ 6,
+ 14,
+ 7,
+ 9,
+ 17,
+ 14,
+ 8,
+ 15,
+ 7,
+ 15,
+ 8,
+ 11,
+ 16,
+ 15,
+ 9,
+ 15,
+ 10,
+ 16,
+ 8,
+ 16,
+ 9,
+ 16,
+ 10,
+ 15,
+ 13,
+ 17,
+ 9,
+ 17,
+ 10,
+ 17,
+ 11,
+ 15,
+ 16,
+ 18,
+ 10,
+ 18,
+ 11,
+ 18,
+ 12,
+ 16,
+ 16,
+ 19,
+ 11,
+ 19,
+ 12,
+ 19,
+ 13,
+ 17,
+ 17,
+ 20,
+ 12,
+ 20,
+ 13,
+ 20,
+ 14,
+ 19,
+ 16,
+ 21,
+ 13,
+ 21,
+ 14,
+ 21,
+ 15,
+ 20,
+ 17,
+ 22,
+ 14,
+ 22,
+ 15,
+ 25,
+ 10,
+ 22,
+ 16,
+ 23,
+ 15,
+ 23,
+ 16,
+ 26,
+ 11,
+ 23,
+ 17,
+ 24,
+ 16,
+ 24,
+ 17,
+ 27,
+ 12,
+ 24,
+ 18,
+ 25,
+ 17,
+ 25,
+ 18,
+ 28,
+ 13,
+ 25,
+ 19,
+ 26,
+ 18,
+ 26,
+ 19,
+ 29,
+ 14,
+ 26,
+ 20,
+ 27,
+ 19,
+ 27,
+ 20,
+ 30,
+ 15,
+ 27,
+ 21,
+ 28,
+ 20,
+ 28,
+ 21,
+ 28,
+ 21,
+ 28,
+ 22,
+ 29,
+ 21,
+ 29,
+ 22,
+ 24,
+ 32,
+ 29,
+ 23,
+ 30,
+ 22,
+ 30,
+ 23,
+ 25,
+ 33,
+ 30,
+ 24,
+ 31,
+ 23,
+ 31,
+ 24,
+ 27,
+ 32,
+ 31,
+ 25,
+ 31,
+ 26,
+ 32,
+ 24,
+ 32,
+ 25,
+ 32,
+ 26,
+ 31,
+ 29,
+ 33,
+ 25,
+ 33,
+ 26,
+ 33,
+ 27,
+ 31,
+ 32,
+ 34,
+ 26,
+ 34,
+ 27,
+ 34,
+ 28,
+ 32,
+ 32,
+ 35,
+ 27,
+ 35,
+ 28,
+ 35,
+ 29,
+ 33,
+ 33,
+ 36,
+ 28,
+ 36,
+ 29,
+ 36,
+ 30,
+ 35,
+ 32,
+ 37,
+ 29,
+ 37,
+ 30,
+ 37,
+ 31,
+ 36,
+ 33,
+ 38,
+ 30,
+ 38,
+ 31,
+ 41,
+ 26,
+ 38,
+ 32,
+ 39,
+ 31,
+ 39,
+ 32,
+ 42,
+ 27,
+ 39,
+ 33,
+ 40,
+ 32,
+ 40,
+ 33,
+ 43,
+ 28,
+ 40,
+ 34,
+ 41,
+ 33,
+ 41,
+ 34,
+ 44,
+ 29,
+ 41,
+ 35,
+ 42,
+ 34,
+ 42,
+ 35,
+ 45,
+ 30,
+ 42,
+ 36,
+ 43,
+ 35,
+ 43,
+ 36,
+ 46,
+ 31,
+ 43,
+ 37,
+ 44,
+ 36,
+ 44,
+ 37,
+ 44,
+ 37,
+ 44,
+ 38,
+ 45,
+ 37,
+ 45,
+ 38,
+ 40,
+ 48,
+ 45,
+ 39,
+ 46,
+ 38,
+ 46,
+ 39,
+ 41,
+ 49,
+ 46,
+ 40,
+ 47,
+ 39,
+ 47,
+ 40,
+ 43,
+ 48,
+ 47,
+ 41,
+ 47,
+ 42,
+ 48,
+ 40,
+ 48,
+ 41,
+ 48,
+ 42,
+ 47,
+ 45,
+ 49,
+ 41,
+ 49,
+ 42,
+ 49,
+ 43,
+ 47,
+ 48,
+ 50,
+ 42,
+ 50,
+ 43,
+ 50,
+ 44,
+ 48,
+ 48,
+ 51,
+ 43,
+ 51,
+ 44,
+ 51,
+ 45,
+ 49,
+ 49,
+ 52,
+ 44,
+ 52,
+ 45,
+ 52,
+ 46,
+ 51,
+ 48,
+ 53,
+ 45,
+ 53,
+ 46,
+ 53,
+ 47,
+ 52,
+ 49,
+ 54,
+ 46,
+ 54,
+ 47,
+ 57,
+ 42,
+ 54,
+ 48,
+ 55,
+ 47,
+ 55,
+ 48,
+ 58,
+ 43,
+ 55,
+ 49,
+ 56,
+ 48,
+ 56,
+ 49,
+ 59,
+ 44,
+ 56,
+ 50,
+ 57,
+ 49,
+ 57,
+ 50,
+ 60,
+ 45,
+ 57,
+ 51,
+ 58,
+ 50,
+ 58,
+ 51,
+ 61,
+ 46,
+ 58,
+ 52,
+ 59,
+ 51,
+ 59,
+ 52,
+ 62,
+ 47,
+ 59,
+ 53,
+ 60,
+ 52,
+ 60,
+ 53,
+ 60,
+ 53,
+ 60,
+ 54,
+ 61,
+ 53,
+ 61,
+ 54,
+ 61,
+ 54,
+ 61,
+ 55,
+ 62,
+ 54,
+ 62,
+ 55,
+ 62,
+ 55,
+ 62,
+ 56,
+ 63,
+ 55,
+ 63,
+ 56,
+ 63,
+ 56,
+ 63,
+ 57,
+ 63,
+ 58,
+ 63,
+ 59,
+ 63,
+ 59,
+ 63,
+ 60,
+ 63,
+ 61,
+ 63,
+ 62,
+ 63,
+ 62,
+ 63,
+ 63,
+};
+
+#endif // BETSY_BC1_H
diff --git a/modules/betsy/image_compress_betsy.cpp b/modules/betsy/image_compress_betsy.cpp
index bc72203b2ff..07298d82fb5 100644
--- a/modules/betsy/image_compress_betsy.cpp
+++ b/modules/betsy/image_compress_betsy.cpp
@@ -30,6 +30,7 @@
#include "image_compress_betsy.h"
+#include "core/config/project_settings.h"
#include "servers/rendering/rendering_device_binds.h"
#include "servers/rendering/rendering_server_default.h"
@@ -40,113 +41,198 @@
#include "drivers/metal/rendering_context_driver_metal.h"
#endif
+#include "betsy_bc1.h"
+
+#include "bc1.glsl.gen.h"
#include "bc6h.glsl.gen.h"
-struct BC6PushConstant {
- float sizeX;
- float sizeY;
- uint32_t padding[2];
-};
+// Static variables (for caching).
+// compress_rd / compress_rcd: the local RenderingDevice and its context driver, created on first use.
+static RenderingDevice *compress_rd = nullptr;
+static RenderingContextDriver *compress_rcd = nullptr;
+// rd_mutex is taken while the device is created, shader_mutex while the shader cache is accessed.
+static Mutex rd_mutex;
+static Mutex shader_mutex;
+// Cached shaders, keyed by "<shader name>-<version>".
+static HashMap<String, Ref<BetsyShader>> cached_shaders;
+
+// Betsy shader (for caching).
+
+BetsyShader::BetsyShader() {
+}
+// Releases the compiled shader, provided the cached RenderingDevice exists and the shader RID is valid.
+BetsyShader::~BetsyShader() {
+ // Free just the shader, the pipelines will be cleared automatically.
+ if (compress_rd && compiled.is_valid()) {
+ compress_rd->free(compiled);
+ }
+}
+
+// Helper functions.
static int get_next_multiple(int n, int m) {
return n + (m - (n % m));
}
-Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
+static String get_shader_name(BetsyFormat p_format) { // Base shader name for a format, without the version suffix.
+ switch (p_format) {
+ case BETSY_FORMAT_BC1:
+ case BETSY_FORMAT_BC1_DITHER:
+ return "BC1"; // shared by the standard and dithered variants
+
+ case BETSY_FORMAT_BC3:
+ return "BC3";
+
+ case BETSY_FORMAT_BC6_SIGNED:
+ case BETSY_FORMAT_BC6_UNSIGNED:
+ return "BC6"; // shared by the signed and unsigned variants
+
+ default:
+ return ""; // any other format yields an empty name
+ }
+}
+
+Error compress_betsy(BetsyFormat p_format, Image *r_img) {
uint64_t start_time = OS::get_singleton()->get_ticks_msec();
if (r_img->is_compressed()) {
return ERR_INVALID_DATA;
}
- ERR_FAIL_COND_V_MSG(r_img->get_format() < Image::FORMAT_RF || r_img->get_format() > Image::FORMAT_RGBE9995, ERR_INVALID_DATA, "Image is not an HDR image.");
-
Error err = OK;
- // Create local RD.
- RenderingContextDriver *rcd = nullptr;
- RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device();
+ rd_mutex.lock();
+ if (!compress_rd) {
+ // Create local RD.
+ RenderingContextDriver *rcd = nullptr;
+ RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device();
- if (rd == nullptr) {
+ if (rd == nullptr) {
#if defined(RD_ENABLED)
#if defined(METAL_ENABLED)
- rcd = memnew(RenderingContextDriverMetal);
- rd = memnew(RenderingDevice);
+ rcd = memnew(RenderingContextDriverMetal);
+ rd = memnew(RenderingDevice);
#endif
#if defined(VULKAN_ENABLED)
- if (rcd == nullptr) {
- rcd = memnew(RenderingContextDriverVulkan);
- rd = memnew(RenderingDevice);
- }
-#endif
-#endif
- if (rcd != nullptr && rd != nullptr) {
- err = rcd->initialize();
- if (err == OK) {
- err = rd->initialize(rcd);
+ if (rcd == nullptr) {
+ rcd = memnew(RenderingContextDriverVulkan);
+ rd = memnew(RenderingDevice);
}
+#endif
+#endif
+ if (rcd != nullptr && rd != nullptr) {
+ err = rcd->initialize();
+ if (err == OK) {
+ err = rd->initialize(rcd);
+ }
- if (err != OK) {
- memdelete(rd);
- memdelete(rcd);
- rd = nullptr;
- rcd = nullptr;
+ if (err != OK) {
+ memdelete(rd);
+ memdelete(rcd);
+ rd = nullptr;
+ rcd = nullptr;
+ }
}
}
+
+ ERR_FAIL_NULL_V_MSG(rd, err, "Unable to create a local RenderingDevice.");
+
+ compress_rd = rd;
+ compress_rcd = rcd;
}
-
- ERR_FAIL_NULL_V_MSG(rd, err, "Unable to create a local RenderingDevice.");
-
- Ref compute_shader;
- compute_shader.instantiate();
+ rd_mutex.unlock();
// Destination format.
Image::Format dest_format = Image::FORMAT_MAX;
+ RD::DataFormat dst_rd_format = RD::DATA_FORMAT_MAX;
String version = "";
switch (p_format) {
- case BETSY_FORMAT_BC6: {
- err = compute_shader->parse_versions_from_text(bc6h_shader_glsl);
+ case BETSY_FORMAT_BC1:
+ version = "standard";
+ dst_rd_format = RD::DATA_FORMAT_R32G32_UINT;
+ dest_format = Image::FORMAT_DXT1;
+ break;
- if (r_img->detect_signed(true)) {
- dest_format = Image::FORMAT_BPTC_RGBF;
- version = "signed";
- } else {
- dest_format = Image::FORMAT_BPTC_RGBFU;
- version = "unsigned";
- }
+ case BETSY_FORMAT_BC1_DITHER:
+ version = "dithered";
+ dst_rd_format = RD::DATA_FORMAT_R32G32_UINT;
+ dest_format = Image::FORMAT_DXT1;
+ break;
- } break;
+ case BETSY_FORMAT_BC6_SIGNED:
+ version = "signed";
+ dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT;
+ dest_format = Image::FORMAT_BPTC_RGBF;
+ break;
+
+ case BETSY_FORMAT_BC6_UNSIGNED:
+ version = "unsigned";
+ dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT;
+ dest_format = Image::FORMAT_BPTC_RGBFU;
+ break;
default:
err = ERR_INVALID_PARAMETER;
break;
}
- if (err != OK) {
- compute_shader->print_errors("Betsy compress shader");
- memdelete(rd);
- if (rcd != nullptr) {
- memdelete(rcd);
+ const String shader_name = get_shader_name(p_format) + "-" + version;
+ const BetsyShader *shader_ptr;
+
+ shader_mutex.lock();
+ if (cached_shaders.has(shader_name)) {
+ shader_ptr = cached_shaders[shader_name].ptr();
+
+ } else {
+ Ref shader;
+ shader.instantiate();
+
+ Ref source;
+ source.instantiate();
+
+ switch (p_format) {
+ case BETSY_FORMAT_BC1:
+ case BETSY_FORMAT_BC1_DITHER:
+ err = source->parse_versions_from_text(bc1_shader_glsl);
+ break;
+
+ case BETSY_FORMAT_BC6_UNSIGNED:
+ case BETSY_FORMAT_BC6_SIGNED:
+ err = source->parse_versions_from_text(bc6h_shader_glsl);
+ break;
+
+ default:
+ err = ERR_INVALID_PARAMETER;
+ break;
}
- return err;
- }
-
- // Compile the shader, return early if invalid.
- RID shader = rd->shader_create_from_spirv(compute_shader->get_spirv_stages(version));
-
- if (shader.is_null()) {
- memdelete(rd);
- if (rcd != nullptr) {
- memdelete(rcd);
+ if (err != OK) {
+ source->print_errors("Betsy compress shader");
+ return err;
}
- return err;
- }
+ // Compile the shader, return early if invalid.
+ shader->compiled = compress_rd->shader_create_from_spirv(source->get_spirv_stages(version));
+ if (shader->compiled.is_null()) {
+ return ERR_CANT_CREATE;
+ }
- RID pipeline = rd->compute_pipeline_create(shader);
+ // Compile the pipeline, return early if invalid.
+ shader->pipeline = compress_rd->compute_pipeline_create(shader->compiled);
+ if (shader->pipeline.is_null()) {
+ return ERR_CANT_CREATE;
+ }
+
+ cached_shaders[shader_name] = shader;
+ shader_ptr = cached_shaders[shader_name].ptr();
+ }
+ shader_mutex.unlock();
+
+ if (shader_ptr->compiled.is_null() || shader_ptr->pipeline.is_null()) {
+ return ERR_INVALID_DATA;
+ }
// src_texture format information.
RD::TextureFormat src_texture_format;
@@ -159,6 +245,33 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
}
switch (r_img->get_format()) {
+ case Image::FORMAT_L8:
+ r_img->convert(Image::FORMAT_RGBA8);
+ src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+ break;
+
+ case Image::FORMAT_LA8:
+ r_img->convert(Image::FORMAT_RGBA8);
+ src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+ break;
+
+ case Image::FORMAT_R8:
+ src_texture_format.format = RD::DATA_FORMAT_R8_UNORM;
+ break;
+
+ case Image::FORMAT_RG8:
+ src_texture_format.format = RD::DATA_FORMAT_R8G8_UNORM;
+ break;
+
+ case Image::FORMAT_RGB8:
+ r_img->convert(Image::FORMAT_RGBA8);
+ src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+ break;
+
+ case Image::FORMAT_RGBA8:
+ src_texture_format.format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+ break;
+
case Image::FORMAT_RH:
src_texture_format.format = RD::DATA_FORMAT_R16_SFLOAT;
break;
@@ -198,13 +311,6 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
break;
default: {
- rd->free(shader);
-
- memdelete(rd);
- if (rcd != nullptr) {
- memdelete(rcd);
- }
-
return err;
}
}
@@ -219,12 +325,25 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
src_sampler_state.mip_filter = RD::SAMPLER_FILTER_NEAREST;
}
- RID src_sampler = rd->sampler_create(src_sampler_state);
+ RID src_sampler = compress_rd->sampler_create(src_sampler_state);
// For the destination format just copy the source format and change the usage bits.
RD::TextureFormat dst_texture_format = src_texture_format;
dst_texture_format.usage_bits = RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT;
- dst_texture_format.format = RD::DATA_FORMAT_R32G32B32A32_UINT;
+ dst_texture_format.format = dst_rd_format;
+
+ RID encoding_table_buffer;
+ bool uses_encoding_table = false;
+
+ // Encoding table setup.
+ if (dest_format == Image::FORMAT_DXT1) {
+ Vector data;
+ data.resize(1024 * 4);
+ memcpy(data.ptrw(), dxt1_encoding_table, 1024 * 4);
+
+ encoding_table_buffer = compress_rd->storage_buffer_create(1024 * 4, data);
+ uses_encoding_table = true;
+ }
const int mip_count = r_img->get_mipmap_count() + 1;
@@ -256,8 +375,41 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
memcpy(src_image_ptr[0].ptrw(), r_img->ptr() + ofs, size);
// Create the textures on the GPU.
- RID src_texture = rd->texture_create(src_texture_format, RD::TextureView(), src_images);
- RID dst_texture = rd->texture_create(dst_texture_format, RD::TextureView());
+ RID src_texture = compress_rd->texture_create(src_texture_format, RD::TextureView(), src_images);
+ RID dst_texture = compress_rd->texture_create(dst_texture_format, RD::TextureView());
+
+ Vector uniforms;
+ {
+ {
+ RD::Uniform u;
+ u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
+ u.binding = 0;
+ u.append_id(src_sampler);
+ u.append_id(src_texture);
+ uniforms.push_back(u);
+ }
+ {
+ RD::Uniform u;
+ u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
+ u.binding = 1;
+ u.append_id(dst_texture);
+ uniforms.push_back(u);
+ }
+
+ if (uses_encoding_table) {
+ RD::Uniform u;
+ u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+ u.binding = 2;
+ u.append_id(encoding_table_buffer);
+ uniforms.push_back(u);
+ }
+ }
+
+ RID uniform_set = compress_rd->uniform_set_create(uniforms, shader_ptr->compiled, 0);
+ RD::ComputeListID compute_list = compress_rd->compute_list_begin();
+
+ compress_rd->compute_list_bind_compute_pipeline(compute_list, shader_ptr->pipeline);
+ compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
if (dest_format == Image::FORMAT_BPTC_RGBFU || dest_format == Image::FORMAT_BPTC_RGBF) {
BC6PushConstant push_constant;
@@ -266,47 +418,33 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
push_constant.padding[0] = 0;
push_constant.padding[1] = 0;
- Vector uniforms;
- {
- {
- RD::Uniform u;
- u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
- u.binding = 0;
- u.append_id(src_sampler);
- u.append_id(src_texture);
- uniforms.push_back(u);
- }
- {
- RD::Uniform u;
- u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
- u.binding = 1;
- u.append_id(dst_texture);
- uniforms.push_back(u);
- }
- }
+ compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
- RID uniform_set = rd->uniform_set_create(uniforms, shader, 0);
- RD::ComputeListID compute_list = rd->compute_list_begin();
+ } else {
+ BC1PushConstant push_constant;
+ push_constant.num_refines = 2;
+ push_constant.padding[0] = 0;
+ push_constant.padding[1] = 0;
+ push_constant.padding[2] = 0;
- rd->compute_list_bind_compute_pipeline(compute_list, pipeline);
- rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
- rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
- rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
- rd->compute_list_end();
+ compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
}
- rd->submit();
- rd->sync();
+ compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
+ compress_rd->compute_list_end();
+
+ compress_rd->submit();
+ compress_rd->sync();
// Copy data from the GPU to the buffer.
- const Vector texture_data = rd->texture_get_data(dst_texture, 0);
+ const Vector texture_data = compress_rd->texture_get_data(dst_texture, 0);
int64_t dst_ofs = Image::get_image_mipmap_offset(r_img->get_width(), r_img->get_height(), dest_format, i);
memcpy(dst_data_ptr + dst_ofs, texture_data.ptr(), texture_data.size());
// Free the source and dest texture.
- rd->free(dst_texture);
- rd->free(src_texture);
+ compress_rd->free(dst_texture);
+ compress_rd->free(src_texture);
}
src_images.clear();
@@ -315,14 +453,11 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
r_img->set_data(r_img->get_width(), r_img->get_height(), r_img->has_mipmaps(), dest_format, dst_data);
// Free the shader (dependencies will be cleared automatically).
- rd->free(src_sampler);
- rd->free(shader);
-
- memdelete(rd);
- if (rcd != nullptr) {
- memdelete(rcd);
+ if (uses_encoding_table) {
+ compress_rd->free(encoding_table_buffer);
}
+ compress_rd->free(src_sampler);
print_verbose(vformat("Betsy: Encoding took %d ms.", OS::get_singleton()->get_ticks_msec() - start_time));
return OK;
@@ -330,10 +465,61 @@ Error _compress_betsy(BetsyFormat p_format, Image *r_img) {
Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels) {
Image::Format format = r_img->get_format();
+ Error result = ERR_UNAVAILABLE; // Stays ERR_UNAVAILABLE when the format is outside the FORMAT_RF..FORMAT_RGBE9995 range tested below.
if (format >= Image::FORMAT_RF && format <= Image::FORMAT_RGBE9995) {
- return _compress_betsy(BETSY_FORMAT_BC6, r_img);
+ if (r_img->detect_signed()) { // The BC6 variant is chosen here rather than inside compress_betsy().
+ result = compress_betsy(BETSY_FORMAT_BC6_SIGNED, r_img);
+ } else {
+ result = compress_betsy(BETSY_FORMAT_BC6_UNSIGNED, r_img);
+ }
}
- return ERR_UNAVAILABLE;
+ if (!GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor")) { // With caching disabled, the device is torn down after every call.
+ free_device();
+ }
+ // Either ERR_UNAVAILABLE (format not handled above) or whatever compress_betsy() reported.
+ return result;
+}
+
+// S3TC entry point: maps the used-channel layout to a BC1 variant and forwards the image to compress_betsy().
+// Returns ERR_UNAVAILABLE for layouts that have no BC1 mapping here, otherwise the result of compress_betsy().
+Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) {
+	// Layouts other than RGB and L leave this value untouched.
+	Error status = ERR_UNAVAILABLE;
+
+	if (p_channels == Image::USED_CHANNELS_RGB) {
+		// RGB images go through the dithered BC1 variant.
+		status = compress_betsy(BETSY_FORMAT_BC1_DITHER, r_img);
+	} else if (p_channels == Image::USED_CHANNELS_L) {
+		// Luminance-only images go through the plain BC1 variant.
+		status = compress_betsy(BETSY_FORMAT_BC1, r_img);
+	}
+
+	// With caching disabled in the project settings, the device is torn down after every call.
+	const bool keep_device = GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor");
+	if (!keep_device) {
+		free_device();
+	}
+
+	return status;
+}
+
+// Tears down the cached compression RenderingDevice, its context driver (if any) and every cached shader; does nothing when no device exists.
+void free_device() {
+	// Test compress_rd only while holding rd_mutex: with the check outside the lock, two racing callers could both pass it and the second would delete a null device.
+	rd_mutex.lock();
+	if (compress_rd != nullptr) {
+		// Drop the cached shaders first, while the device still exists. NOTE(review): assumes ~BetsyShader() does not lock rd_mutex itself - confirm.
+		shader_mutex.lock();
+		cached_shaders.clear();
+		shader_mutex.unlock();
+		memdelete(compress_rd);
+		compress_rd = nullptr;
+		if (compress_rcd != nullptr) {
+			memdelete(compress_rcd);
+			compress_rcd = nullptr;
+		}
+	}
+	rd_mutex.unlock();
}
diff --git a/modules/betsy/image_compress_betsy.h b/modules/betsy/image_compress_betsy.h
index a64e586c76c..522b09160a6 100644
--- a/modules/betsy/image_compress_betsy.h
+++ b/modules/betsy/image_compress_betsy.h
@@ -34,11 +34,38 @@
#include "core/io/image.h"
enum BetsyFormat {
- BETSY_FORMAT_BC6,
+ BETSY_FORMAT_BC1, // BC1 using the "standard" shader version; compress_betsy() outputs Image::FORMAT_DXT1.
+ BETSY_FORMAT_BC1_DITHER, // BC1 using the "dithered" shader version; compress_betsy() outputs Image::FORMAT_DXT1.
+ BETSY_FORMAT_BC3, // NOTE(review): compress_betsy()'s format switch has no case for this value; it lands in the default branch.
+ BETSY_FORMAT_BC6_SIGNED, // BC6 using the "signed" shader version; compress_betsy() outputs Image::FORMAT_BPTC_RGBF.
+ BETSY_FORMAT_BC6_UNSIGNED, // BC6 using the "unsigned" shader version; compress_betsy() outputs Image::FORMAT_BPTC_RGBFU.
};
-Error _compress_betsy(BetsyFormat p_format, Image *r_img);
+class BetsyShader : public RefCounted {
+public:
+ RID compiled; // Shader RID; compress_betsy() assigns it from shader_create_from_spirv().
+ RID pipeline; // Compute pipeline RID; compress_betsy() creates it from `compiled` via compute_pipeline_create().
+ // Constructor and destructor are only declared here; their bodies live outside this header.
+ BetsyShader();
+ ~BetsyShader();
+};
+// Push-constant block handed to the BC6 compute shader via compute_list_set_push_constant() with sizeof(BC6PushConstant).
+struct BC6PushConstant {
+ float sizeX; // Presumably the horizontal size of the processed image - the assignment is not visible here, confirm.
+ float sizeY; // Presumably the vertical size of the processed image - the assignment is not visible here, confirm.
+ uint32_t padding[2]; // Zeroed by compress_betsy(); presumably pads the block to 16 bytes - confirm against the shader.
+};
+// Push-constant block handed to the BC1 compute shader via compute_list_set_push_constant() with sizeof(BC1PushConstant).
+struct BC1PushConstant {
+ uint32_t num_refines; // compress_betsy() always sets this to 2.
+ uint32_t padding[3]; // Zeroed by compress_betsy(); presumably pads the block to 16 bytes - confirm against the shader.
+};
+// Deletes the cached compression RenderingDevice (plus its context driver, if one was created) and clears the shader cache.
+void free_device();
+// Compresses r_img in place into the format selected by p_format; returns OK on success, an error code otherwise.
+Error compress_betsy(BetsyFormat p_format, Image *r_img);
Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels);
+Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels);
#endif // IMAGE_COMPRESS_BETSY_H
diff --git a/modules/betsy/register_types.cpp b/modules/betsy/register_types.cpp
index 019099e67c6..a3a3b5a99bf 100644
--- a/modules/betsy/register_types.cpp
+++ b/modules/betsy/register_types.cpp
@@ -38,10 +38,13 @@ void initialize_betsy_module(ModuleInitializationLevel p_level) {
}
Image::_image_compress_bptc_rd_func = _betsy_compress_bptc;
+ Image::_image_compress_bc_rd_func = _betsy_compress_s3tc;
}
void uninitialize_betsy_module(ModuleInitializationLevel p_level) {
if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
return;
}
+ // Release the cached compression device and its shader cache when the module is torn down.
+ free_device();
}
diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp
index f354e83893b..020fa94e9dc 100644
--- a/servers/rendering_server.cpp
+++ b/servers/rendering_server.cpp
@@ -3528,6 +3528,7 @@ void RenderingServer::init() {
GLOBAL_DEF_RST("rendering/textures/vram_compression/import_s3tc_bptc", false);
GLOBAL_DEF_RST("rendering/textures/vram_compression/import_etc2_astc", false);
GLOBAL_DEF("rendering/textures/vram_compression/compress_with_gpu", true);
+ GLOBAL_DEF("rendering/textures/vram_compression/cache_gpu_compressor", true);
GLOBAL_DEF("rendering/textures/lossless_compression/force_png", false);
diff --git a/thirdparty/README.md b/thirdparty/README.md
index dbe20ba2a58..ca6be902e32 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -78,7 +78,7 @@ fix build with our own copy of zstd (patch in `patches`).
Files extracted from upstream source:
-- `bc6h.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
+- `bc6h.glsl`, `bc1.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
- `LICENSE.md`