godot/servers/rendering/renderer_rd/shaders/sort.glsl

#[compute]

#version 450

#VERSION_DEFINES

// Original version here:
// https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11/blob/master/gpuparticles11/src/Shaders

//
// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//

// Number of elements one workgroup sorts in shared memory.
#define SORT_SIZE 512
// Every invocation handles one compare-exchange pair, hence half as many
// invocations as elements.
#define NUM_THREADS (SORT_SIZE / 2)
// NOTE(review): not referenced anywhere in this file; presumably a leftover
// from the original AMD shader - confirm before removing.
#define INVERSION (16 * 2 + 8 * 3)
// Compare-exchange pairs handled per invocation in MODE_SORT_BLOCK; its
// load/store loops run 2 * ITERATIONS times.
#define ITERATIONS 1

layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in;

#ifndef MODE_SORT_STEP

// Workgroup-shared copy of the block being sorted. MODE_SORT_STEP operates
// directly on sort_buffer and does not declare it.
shared vec2 g_LDS[SORT_SIZE];

#endif

// Elements to sort, in place. Only the .x component is compared; the whole
// vec2 is moved, so .y travels with its key.
layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {
	vec2 data[];
}
sort_buffer;

layout(push_constant, binding = 0, std430) uniform Params {
	// Number of valid elements in sort_buffer.data; used as the upper bound
	// for every access.
	uint total_elements;
	// Explicit padding in front of job_params.
	uint pad[3];
	// Read only by MODE_SORT_STEP: (x - 1) is used as a bit mask on the pair
	// index, y is added to form the partner index and z multiplies the low
	// index bits. w is not read in this file.
	ivec4 job_params;
}
params;

#ifndef MODE_SORT_STEP

// Returns how many valid elements the SORT_SIZE-wide block that starts at
// `group_start` contains: SORT_SIZE for a full block, fewer for the tail
// block, and 0 when the block starts at or beyond params.total_elements.
//
// The range test happens before the subtraction because both operands are
// unsigned: `total_elements - group_start` would otherwise wrap around to a
// huge value and an out-of-range block would be treated as a full one.
int get_group_element_count(uint group_start) {
	if (group_start >= params.total_elements) {
		return 0;
	}
	return int(min(uint(SORT_SIZE), params.total_elements - group_start));
}

#endif

// Entry point. Exactly one of the following modes is expected to be defined:
//
//  MODE_SORT_BLOCK - fully sorts each SORT_SIZE-element block of
//                    sort_buffer.data (ascending by .x) in shared memory.
//  MODE_SORT_STEP  - performs a single compare-exchange pass directly on
//                    sort_buffer.data, with the pairing described by
//                    params.job_params.
//  MODE_SORT_INNER - runs the compare-exchange passes with distances
//                    SORT_SIZE / 2 .. 1 on each block in shared memory.
//
// All modes use one invocation per compare-exchange pair and order pairs so
// that the element with the smaller .x ends up at the lower index.
void main() {
#ifdef MODE_SORT_BLOCK

	uint group_start = gl_WorkGroupID.x * SORT_SIZE;

	int GlobalBaseIndex = int(group_start + gl_LocalInvocationID.x);
	int LocalBaseIndex = int(gl_LocalInvocationIndex);
	int numElementsInThreadGroup = get_group_element_count(group_start);

	// Load shared data: every invocation copies up to 2 * ITERATIONS
	// elements, skipping the slots past the end of a partial block.
	int i;
	for (i = 0; i < 2 * ITERATIONS; ++i) {
		if (LocalBaseIndex + i * NUM_THREADS < numElementsInThreadGroup) {
			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
		}
	}

	groupMemoryBarrier();
	barrier();

	// Bitonic sort
	for (int nMergeSize = 2; nMergeSize <= SORT_SIZE; nMergeSize = nMergeSize * 2) {
		for (int nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
			for (i = 0; i < ITERATIONS; ++i) {
				int tmp_index = LocalBaseIndex + NUM_THREADS * i;
				int index_low = tmp_index & (nMergeSubSize - 1);
				int index_high = 2 * (tmp_index - index_low);
				int index = index_high + index_low;

				int nSwapElem;
				if (nMergeSubSize == (nMergeSize >> 1)) {
					// First pass of a merge: pair each element with its
					// mirror image inside the 2 * nMergeSubSize window.
					nSwapElem = index_high + (2 * nMergeSubSize - 1) - index_low;
				} else {
					// Remaining passes: partner is nMergeSubSize further on.
					nSwapElem = index_high + nMergeSubSize + index_low;
				}

				// index < nSwapElem always holds, so checking the partner
				// is enough to keep both accesses inside the valid range.
				if (nSwapElem < numElementsInThreadGroup) {
					vec2 a = g_LDS[index];
					vec2 b = g_LDS[nSwapElem];

					if (a.x > b.x) {
						g_LDS[index] = b;
						g_LDS[nSwapElem] = a;
					}
				}

				// Kept outside the bounds check so that every invocation of
				// the workgroup reaches the barrier.
				groupMemoryBarrier();
				barrier();
			}
		}
	}

	// Store shared data
	for (i = 0; i < 2 * ITERATIONS; ++i) {
		if (LocalBaseIndex + i * NUM_THREADS < numElementsInThreadGroup) {
			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
		}
	}

#endif

#ifdef MODE_SORT_STEP

	// Index of the compare-exchange pair handled by this invocation, counted
	// across the whole buffer.
	uint localID = gl_WorkGroupID.x * NUM_THREADS + gl_LocalInvocationID.x;

	// (job_params.x - 1) is used as a bit mask, so job_params.x is presumably
	// a power of two (the compare distance) - confirm against the host code.
	uint index_low = localID & (params.job_params.x - 1);
	uint index_high = 2 * (localID - index_low);

	uint index = index_high + index_low;
	// job_params.z may be negative; the arithmetic is unsigned and wraps, so
	// the sum still yields the intended index modulo 2^32.
	// NOTE(review): by analogy with MODE_SORT_BLOCK, (y, z) is presumably
	// (2 * x - 1, -1) for the mirrored first pass and (x, 1) otherwise.
	uint nSwapElem = index_high + params.job_params.y + params.job_params.z * index_low;

	if (nSwapElem < params.total_elements) {
		vec2 a = sort_buffer.data[index];
		vec2 b = sort_buffer.data[nSwapElem];

		if (a.x > b.x) {
			sort_buffer.data[index] = b;
			sort_buffer.data[nSwapElem] = a;
		}
	}

#endif

#ifdef MODE_SORT_INNER

	uint group_start = gl_WorkGroupID.x * SORT_SIZE;

	int GlobalBaseIndex = int(group_start + gl_LocalInvocationID.x);
	int LocalBaseIndex = int(gl_LocalInvocationIndex);
	int numElementsInThreadGroup = get_group_element_count(group_start);
	int i;

	// Load shared data
	for (i = 0; i < 2; ++i) {
		if (LocalBaseIndex + i * NUM_THREADS < numElementsInThreadGroup) {
			g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];
		}
	}

	groupMemoryBarrier();
	barrier();

	// sort threadgroup shared memory
	for (int nMergeSubSize = SORT_SIZE >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {
		int index_low = LocalBaseIndex & (nMergeSubSize - 1);
		int index_high = 2 * (LocalBaseIndex - index_low);
		int index = index_high + index_low;

		int nSwapElem = index_high + nMergeSubSize + index_low;

		if (nSwapElem < numElementsInThreadGroup) {
			vec2 a = g_LDS[index];
			vec2 b = g_LDS[nSwapElem];

			if (a.x > b.x) {
				g_LDS[index] = b;
				g_LDS[nSwapElem] = a;
			}
		}

		// Kept outside the bounds check so that every invocation of the
		// workgroup reaches the barrier.
		groupMemoryBarrier();
		barrier();
	}

	// Store shared data
	for (i = 0; i < 2; ++i) {
		if (LocalBaseIndex + i * NUM_THREADS < numElementsInThreadGroup) {
			sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];
		}
	}

#endif
}
Re-Implement GPU particles on master. -No new features yet -Unlike godot 3.x, sorting happens using GPU 2020-08-19 13:38:24 +00:00			`#[compute]`

			`#version 450`

Refactor GLSL shader compilation -Used a more consistent set of keywords for the shader -Remove all hardcoded entry points -Re-wrote the GLSL shader parser, new system is more flexible. Allows any entry point organization. -Entry point for sky shaders is now sky(). -Entry point for particle shaders is now process(). 2021-04-13 20:01:43 +00:00			`#VERSION_DEFINES`
Re-Implement GPU particles on master. -No new features yet -Unlike godot 3.x, sorting happens using GPU 2020-08-19 13:38:24 +00:00
			`// Original version here:`
			`// https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11/blob/master/gpuparticles11/src/Shaders`

			`//`
			`// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.`
			`//`
			`// Permission is hereby granted, free of charge, to any person obtaining a copy`
			`// of this software and associated documentation files (the "Software"), to deal`
			`// in the Software without restriction, including without limitation the rights`
			`// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`// copies of the Software, and to permit persons to whom the Software is`
			`// furnished to do so, subject to the following conditions:`
			`//`
			`// The above copyright notice and this permission notice shall be included in`
			`// all copies or substantial portions of the Software.`
			`//`
			`// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`// THE SOFTWARE.`
			`//`

			`#define SORT_SIZE 512`
			`#define NUM_THREADS (SORT_SIZE / 2)`
			`#define INVERSION (16 * 2 + 8 * 3)`
			`#define ITERATIONS 1`

			`layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in;`

			`#ifndef MODE_SORT_STEP`

			`shared vec2 g_LDS[SORT_SIZE];`

			`#endif`

			`layout(set = 1, binding = 0, std430) restrict buffer SortBuffer {`
			`vec2 data[];`
			`}`
			`sort_buffer;`

			`layout(push_constant, binding = 0, std430) uniform Params {`
			`uint total_elements;`
			`uint pad[3];`
			`ivec4 job_params;`
			`}`
			`params;`

			`void main() {`
			`#ifdef MODE_SORT_BLOCK`

			`uvec3 Gid = gl_WorkGroupID;`
			`uvec3 DTid = gl_GlobalInvocationID;`
			`uvec3 GTid = gl_LocalInvocationID;`
			`uint GI = gl_LocalInvocationIndex;`

			`int GlobalBaseIndex = int((Gid.x * SORT_SIZE) + GTid.x);`
			`int LocalBaseIndex = int(GI);`
			`int numElementsInThreadGroup = int(min(SORT_SIZE, params.total_elements - (Gid.x * SORT_SIZE)));`

			`// Load shared data`

			`int i;`
			`for (i = 0; i < 2 * ITERATIONS; ++i) {`
			`if (GI + i * NUM_THREADS < numElementsInThreadGroup)`
			`g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];`
			`}`

			`groupMemoryBarrier();`
			`barrier();`

			`// Bitonic sort`
			`for (int nMergeSize = 2; nMergeSize <= SORT_SIZE; nMergeSize = nMergeSize * 2) {`
			`for (int nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {`
			`for (i = 0; i < ITERATIONS; ++i) {`
			`int tmp_index = int(GI + NUM_THREADS * i);`
			`int index_low = tmp_index & (nMergeSubSize - 1);`
			`int index_high = 2 * (tmp_index - index_low);`
			`int index = index_high + index_low;`

			`int nSwapElem = nMergeSubSize == nMergeSize >> 1 ? index_high + (2 * nMergeSubSize - 1) - index_low : index_high + nMergeSubSize + index_low;`
			`if (nSwapElem < numElementsInThreadGroup) {`
			`vec2 a = g_LDS[index];`
			`vec2 b = g_LDS[nSwapElem];`

			`if (a.x > b.x) {`
			`g_LDS[index] = b;`
			`g_LDS[nSwapElem] = a;`
			`}`
			`}`
			`groupMemoryBarrier();`
			`barrier();`
			`}`
			`}`
			`}`

			`// Store shared data`
			`for (i = 0; i < 2 * ITERATIONS; ++i) {`
			`if (GI + i * NUM_THREADS < numElementsInThreadGroup) {`
			`sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];`
			`}`
			`}`

			`#endif`

			`#ifdef MODE_SORT_STEP`

			`uvec3 Gid = gl_WorkGroupID;`
			`uvec3 GTid = gl_LocalInvocationID;`

			`ivec4 tgp;`

			`tgp.x = int(Gid.x) * 256;`
			`tgp.y = 0;`
			`tgp.z = int(params.total_elements);`
			`tgp.w = min(512, max(0, tgp.z - int(Gid.x) * 512));`

			`uint localID = int(tgp.x) + GTid.x; // calculate threadID within this sortable-array`

			`uint index_low = localID & (params.job_params.x - 1);`
			`uint index_high = 2 * (localID - index_low);`

			`uint index = tgp.y + index_high + index_low;`
			`uint nSwapElem = tgp.y + index_high + params.job_params.y + params.job_params.z * index_low;`

			`if (nSwapElem < tgp.y + tgp.z) {`
			`vec2 a = sort_buffer.data[index];`
			`vec2 b = sort_buffer.data[nSwapElem];`

			`if (a.x > b.x) {`
			`sort_buffer.data[index] = b;`
			`sort_buffer.data[nSwapElem] = a;`
			`}`
			`}`

			`#endif`

			`#ifdef MODE_SORT_INNER`

			`uvec3 Gid = gl_WorkGroupID;`
			`uvec3 DTid = gl_GlobalInvocationID;`
			`uvec3 GTid = gl_LocalInvocationID;`
			`uint GI = gl_LocalInvocationIndex;`

			`ivec4 tgp;`

			`tgp.x = int(Gid.x * 256);`
			`tgp.y = 0;`
			`tgp.z = int(params.total_elements.x);`
			`tgp.w = int(min(512, max(0, params.total_elements - Gid.x * 512)));`

			`int GlobalBaseIndex = int(tgp.y + tgp.x * 2 + GTid.x);`
			`int LocalBaseIndex = int(GI);`
			`int i;`

			`// Load shared data`
			`for (i = 0; i < 2; ++i) {`
			`if (GI + i * NUM_THREADS < tgp.w)`
			`g_LDS[LocalBaseIndex + i * NUM_THREADS] = sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS];`
			`}`

			`groupMemoryBarrier();`
			`barrier();`

			`// sort threadgroup shared memory`
			`for (int nMergeSubSize = SORT_SIZE >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1) {`
			`int tmp_index = int(GI);`
			`int index_low = tmp_index & (nMergeSubSize - 1);`
			`int index_high = 2 * (tmp_index - index_low);`
			`int index = index_high + index_low;`

			`int nSwapElem = index_high + nMergeSubSize + index_low;`

			`if (nSwapElem < tgp.w) {`
			`vec2 a = g_LDS[index];`
			`vec2 b = g_LDS[nSwapElem];`

			`if (a.x > b.x) {`
			`g_LDS[index] = b;`
			`g_LDS[nSwapElem] = a;`
			`}`
			`}`
			`groupMemoryBarrier();`
			`barrier();`
			`}`

			`// Store shared data`
			`for (i = 0; i < 2; ++i) {`
			`if (GI + i * NUM_THREADS < tgp.w) {`
			`sort_buffer.data[GlobalBaseIndex + i * NUM_THREADS] = g_LDS[LocalBaseIndex + i * NUM_THREADS];`
			`}`
			`}`

			`#endif`
			`}`