godot/servers/rendering/renderer_rd/shaders/cluster_render.glsl

#[vertex]

#version 450

#VERSION_DEFINES

// Input vertex position; main() scales and transforms it per element.
layout(location = 0) in vec3 vertex_attrib;

// Negated Z of the transformed vertex, interpolated for the fragment stage.
layout(location = 0) out float depth_interp;
// Index of the element being drawn (base_index + instance index); flat, so not interpolated.
layout(location = 1) out flat uint element_index;

// Per-draw push constants.
// base_index is added to gl_InstanceIndex to obtain the element index for this draw.
// pad0..pad2 are unused by the shader; they round the block up to 16 bytes.
// Note: push-constant blocks are not bound through descriptor sets, so no
// `binding` qualifier is used here (recent glslang versions reject it).
layout(push_constant, std430) uniform Params {
	uint base_index;
	uint pad0;
	uint pad1;
	uint pad2;
}
params;

// Cluster builder state, declared identically in the vertex and fragment stages.
// NOTE(review): member order/padding presumably mirrors a CPU-side struct — confirm before changing.
layout(set = 0, binding = 1, std140) uniform State {
	mat4 projection; // applied to the transformed vertex to produce gl_Position

	float inv_z_far; // multiplied with the interpolated depth to obtain unit depth (fragment stage)
	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
	uint cluster_screen_width; // row stride (in clusters) used to linearize 2D cluster coordinates
	uint cluster_data_size; // how much data for a single cluster takes (in uints of the cluster buffer)

	uint cluster_depth_offset; // offset, inside one cluster's data, of the per-element depth masks
	uint pad0; // padding, unused by the shader
	uint pad1;
	uint pad2;
}
state;

// One entry of the render element buffer, indexed by element_index.
// This shader only reads `scale` and `transform_inv`; the remaining members are
// part of the buffer layout and are not accessed here.
struct RenderElement {
	uint type; //0-4
	bool touches_near;
	bool touches_far;
	uint original_index;
	mat3x4 transform_inv; // applied as vec4(vertex, 1.0) * transform_inv in main()
	vec3 scale; // per-axis scale applied to the input vertex before the transform
	uint pad;
};

// Read-only array of elements to rasterize; indexed with params.base_index + gl_InstanceIndex.
layout(set = 0, binding = 2, std430) buffer restrict readonly RenderElements {
	RenderElement data[];
}
render_elements;

// Vertex stage: places the input vertex using the current element's scale and
// transform, and forwards the element index and depth to the fragment stage.
void main() {
	// Each instance of the draw maps to one entry of the element buffer.
	uint slot = params.base_index + gl_InstanceIndex;
	element_index = slot;

	// Scale first, then apply the element transform (row-vector times mat3x4 yields a vec3).
	vec3 scaled = vertex_attrib * render_elements.data[slot].scale;
	vec3 transformed = vec4(scaled, 1.0) * render_elements.data[slot].transform_inv;

	// Negated Z is interpolated and later turned into a depth slice by the fragment stage.
	depth_interp = -transformed.z;

	gl_Position = state.projection * vec4(transformed, 1.0);
}

#[fragment]

#version 450

#VERSION_DEFINES

// The subgroup path is only compiled when all three has_* macros below are defined.
// NOTE(review): these macros are presumably injected through #VERSION_DEFINES by the
// shader compiler when the device supports the extension — confirm.
#if defined(has_GL_KHR_shader_subgroup_ballot) && defined(has_GL_KHR_shader_subgroup_arithmetic) && defined(has_GL_KHR_shader_subgroup_vote)

#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_vote : enable

// Selects the subgroup-merged write path in main() instead of one atomic per fragment.
#define USE_SUBGROUPS
#endif

// Interpolated negated Z written by the vertex stage.
layout(location = 0) in float depth_interp;
// Index of the element this fragment belongs to (flat, identical for the whole primitive).
layout(location = 1) in flat uint element_index;

// Cluster builder state, declared identically in the vertex and fragment stages.
// NOTE(review): member order/padding presumably mirrors a CPU-side struct — confirm before changing.
layout(set = 0, binding = 1, std140) uniform State {
	mat4 projection; // used by the vertex stage only
	float inv_z_far; // multiplied with depth_interp to obtain unit depth
	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
	uint cluster_screen_width; // row stride (in clusters) used to linearize 2D cluster coordinates
	uint cluster_data_size; // how much data for a single cluster takes (in uints of the cluster buffer)
	uint cluster_depth_offset; // offset, inside one cluster's data, of the per-element depth masks
	uint pad0; // padding, unused by the shader
	uint pad1;
	uint pad2;
}
state;

// Cluster data is laid out linearly; each cell (cluster) contains the following information:
// - a list of bits, one per element, marking the element as used in this cluster,
//   so (max_elem_count/32)*4 uints; element N is bit (N & 31) of uint (N >> 5)
// - a uint for each element marking the depth bits used when rendering (0-31),
//   stored starting at state.cluster_depth_offset inside the cell

// Output buffer written with atomicOr from the fragment stage.
layout(set = 0, binding = 3, std430) buffer restrict ClusterRender {
	uint data[];
}
cluster_render;

// Fragment stage: for every covered cluster, marks the current element as used
// and records which of the 32 depth slices the element touches.
//
// Two words are updated per (cluster, element) pair with atomicOr:
//  - the usage word holding the element's bit, and
//  - the element's depth mask word.
//
// With USE_SUBGROUPS, invocations of a subgroup that target the same
// (cluster, element) pair merge their depth bits and a single invocation
// performs the atomics, following the scalarization loop described in
// http://advances.realtimerendering.com/s2017/2017_Sig_Improved_Culling_final.pdf
void main() {
	//convert from screen to cluster
	uvec2 cluster = uvec2(gl_FragCoord.xy) >> state.screen_to_clusters_shift;

	//get linear cluster offset from screen pos
	uint cluster_offset = cluster.x + state.cluster_screen_width * cluster.y;
	//multiply by data size to position at the beginning of the element list for this cluster
	cluster_offset *= state.cluster_data_size;

	//find the current element in the list and plot the bit to mark it as used
	uint usage_write_offset = cluster_offset + (element_index >> 5);
	uint usage_write_bit = 1u << (element_index & 0x1Fu);

	//find the current element in the depth usage list and mark the current depth as used
	float unit_depth = depth_interp * state.inv_z_far;

	// Clamp while still in float: converting a negative (or too large) float
	// to uint is undefined, so the range must be enforced before the cast.
	uint z_bit = uint(clamp(floor(unit_depth * 32.0), 0.0, 31.0));

	uint z_write_offset = cluster_offset + state.cluster_depth_offset + element_index;
	uint z_write_bit = 1u << z_bit;

#ifdef USE_SUBGROUPS
	if (!gl_HelperInvocation) {
		// A subgroup may contain fragments from several clusters AND several
		// elements (instances), so the grouping key must include both;
		// otherwise writes of non-leader elements would be lost.
		uvec2 write_key = uvec2(cluster_offset, element_index);

		while (true) {
			// find the key of the first active thread
			// threads that did break; go inactive and no longer count
			uvec2 first_key = subgroupBroadcastFirst(write_key);

			if (first_key == write_key) {
				// Only the threads sharing this key are active in this branch,
				// so the reductions below are restricted to this
				// (cluster, element) group instead of the whole subgroup.
				uint merged_z_bits = subgroupOr(z_write_bit); //merge all Zs of the group
				uvec4 group_mask = subgroupBallot(true);

				// The first thread of the group writes on behalf of all of them.
				if (subgroupBallotExclusiveBitCount(group_mask) == 0) {
					atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
					atomicOr(cluster_render.data[z_write_offset], merged_z_bits);
				}

				// This thread's group has been handled, so exit the loop.
				break;
			}
		}
	}
#else
	// Helper invocations only exist for derivatives and must not touch memory.
	if (!gl_HelperInvocation) {
		atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
		atomicOr(cluster_render.data[z_write_offset], z_write_bit);
	}
#endif
}
Added GPU based cluster builder Clustering is now GPU based, uses an implementation based on the Activision algorithm. 2021-01-17 16:25:38 +00:00			`#[vertex]`

			`#version 450`

Refactor GLSL shader compilation -Used a more consistent set of keywords for the shader -Remove all harcoded entry points -Re-wrote the GLSL shader parser, new system is more flexible. Allows any entry point organization. -Entry point for sky shaders is now sky(). -Entry point for particle shaders is now process(). 2021-04-13 20:01:43 +00:00			`#VERSION_DEFINES`
Added GPU based cluster builder Clustering is now GPU based, uses an implementation based on the Activision algorithm. 2021-01-17 16:25:38 +00:00
			`layout(location = 0) in vec3 vertex_attrib;`

			`layout(location = 0) out float depth_interp;`
			`layout(location = 1) out flat uint element_index;`

			`layout(push_constant, binding = 0, std430) uniform Params {`
			`uint base_index;`
			`uint pad0;`
			`uint pad1;`
			`uint pad2;`
			`}`
			`params;`

			`layout(set = 0, binding = 1, std140) uniform State {`
			`mat4 projection;`

			`float inv_z_far;`
			`uint screen_to_clusters_shift; // shift to obtain coordinates in block indices`
			`uint cluster_screen_width; //`
			`uint cluster_data_size; // how much data for a single cluster takes`

			`uint cluster_depth_offset;`
			`uint pad0;`
			`uint pad1;`
			`uint pad2;`
			`}`
			`state;`

			`struct RenderElement {`
			`uint type; //0-4`
			`bool touches_near;`
			`bool touches_far;`
			`uint original_index;`
			`mat3x4 transform_inv;`
			`vec3 scale;`
			`uint pad;`
			`};`

			`layout(set = 0, binding = 2, std430) buffer restrict readonly RenderElements {`
			`RenderElement data[];`
			`}`
			`render_elements;`

			`void main() {`
			`element_index = params.base_index + gl_InstanceIndex;`

			`vec3 vertex = vertex_attrib;`
			`vertex *= render_elements.data[element_index].scale;`

			`vertex = vec4(vertex, 1.0) * render_elements.data[element_index].transform_inv;`
			`depth_interp = -vertex.z;`

			`gl_Position = state.projection * vec4(vertex, 1.0);`
			`}`

			`#[fragment]`

			`#version 450`

Refactor GLSL shader compilation -Used a more consistent set of keywords for the shader -Remove all harcoded entry points -Re-wrote the GLSL shader parser, new system is more flexible. Allows any entry point organization. -Entry point for sky shaders is now sky(). -Entry point for particle shaders is now process(). 2021-04-13 20:01:43 +00:00			`#VERSION_DEFINES`
Added GPU based cluster builder Clustering is now GPU based, uses an implementation based on the Activision algorithm. 2021-01-17 16:25:38 +00:00
As GLSLang seems to be all or nothing, added our own defines 2021-03-31 10:31:27 +00:00			`#if defined(has_GL_KHR_shader_subgroup_ballot) && defined(has_GL_KHR_shader_subgroup_arithmetic) && defined(has_GL_KHR_shader_subgroup_vote)`
Added GPU based cluster builder Clustering is now GPU based, uses an implementation based on the Activision algorithm. 2021-01-17 16:25:38 +00:00
			`#extension GL_KHR_shader_subgroup_ballot : enable`
			`#extension GL_KHR_shader_subgroup_arithmetic : enable`
			`#extension GL_KHR_shader_subgroup_vote : enable`

			`#define USE_SUBGROUPS`
			`#endif`

			`layout(location = 0) in float depth_interp;`
			`layout(location = 1) in flat uint element_index;`

			`layout(set = 0, binding = 1, std140) uniform State {`
			`mat4 projection;`
			`float inv_z_far;`
			`uint screen_to_clusters_shift; // shift to obtain coordinates in block indices`
			`uint cluster_screen_width; //`
			`uint cluster_data_size; // how much data for a single cluster takes`
			`uint cluster_depth_offset;`
			`uint pad0;`
			`uint pad1;`
			`uint pad2;`
			`}`
			`state;`

			`//cluster data is layout linearly, each cell contains the follow information:`
			`// - list of bits for every element to mark as used, so (max_elem_count/32)*4 uints`
			`// - a uint for each element to mark the depth bits used when rendering (0-31)`

			`layout(set = 0, binding = 3, std430) buffer restrict ClusterRender {`
			`uint data[];`
			`}`
			`cluster_render;`

			`void main() {`
			`//convert from screen to cluster`
			`uvec2 cluster = uvec2(gl_FragCoord.xy) >> state.screen_to_clusters_shift;`

			`//get linear cluster offset from screen poss`
			`uint cluster_offset = cluster.x + state.cluster_screen_width * cluster.y;`
			`//multiply by data size to position at the beginning of the element list for this cluster`
			`cluster_offset *= state.cluster_data_size;`

			`//find the current element in the list and plot the bit to mark it as used`
			`uint usage_write_offset = cluster_offset + (element_index >> 5);`
			`uint usage_write_bit = 1 << (element_index & 0x1F);`

			`#ifdef USE_SUBGROUPS`

			`uint cluster_thread_group_index;`

			`if (!gl_HelperInvocation) {`
			`//http://advances.realtimerendering.com/s2017/2017_Sig_Improved_Culling_final.pdf`

			`uvec4 mask;`

			`while (true) {`
			`// find the cluster offset of the first active thread`
			`// threads that did break; go inactive and no longer count`
			`uint first = subgroupBroadcastFirst(cluster_offset);`
			`// update the mask for thread that match this cluster`
			`mask = subgroupBallot(first == cluster_offset);`
			`if (first == cluster_offset) {`
			`// This thread belongs to the group of threads that match this offset,`
			`// so exit the loop.`
			`break;`
			`}`
			`}`

			`cluster_thread_group_index = subgroupBallotExclusiveBitCount(mask);`

			`if (cluster_thread_group_index == 0) {`
			`atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);`
			`}`
			`}`
			`#else`
			`if (!gl_HelperInvocation) {`
			`atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);`
			`}`
			`#endif`
			`//find the current element in the depth usage list and mark the current depth as used`
			`float unit_depth = depth_interp * state.inv_z_far;`

			`uint z_bit = clamp(uint(floor(unit_depth * 32.0)), 0, 31);`

			`uint z_write_offset = cluster_offset + state.cluster_depth_offset + element_index;`
			`uint z_write_bit = 1 << z_bit;`

			`#ifdef USE_SUBGROUPS`
			`if (!gl_HelperInvocation) {`
			`z_write_bit = subgroupOr(z_write_bit); //merge all Zs`
			`if (cluster_thread_group_index == 0) {`
			`atomicOr(cluster_render.data[z_write_offset], z_write_bit);`
			`}`
			`}`
			`#else`
			`if (!gl_HelperInvocation) {`
			`atomicOr(cluster_render.data[z_write_offset], z_write_bit);`
			`}`
			`#endif`
			`}`