diff --git a/modules/lightmapper_rd/lightmapper_rd.cpp b/modules/lightmapper_rd/lightmapper_rd.cpp
index feb9a2274e1..fe919953c1e 100644
--- a/modules/lightmapper_rd/lightmapper_rd.cpp
+++ b/modules/lightmapper_rd/lightmapper_rd.cpp
@@ -124,7 +124,7 @@ void LightmapperRD::add_probe(const Vector3 &p_position) {
 	probe_positions.push_back(probe);
 }
 
-void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector<TriangleSort> &triangles, uint32_t p_grid_size) {
+void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector<TriangleSort> &p_triangles_sort, uint32_t p_grid_size) {
 	int half_size = p_size / 2;
 
 	for (int i = 0; i < 8; i++) {
@@ -159,13 +159,69 @@ void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Ve
 			TriangleSort ts;
 			ts.cell_index = n.x + (n.y * p_grid_size) + (n.z * p_grid_size * p_grid_size);
 			ts.triangle_index = p_triangle_index;
-			triangles.push_back(ts);
+			ts.triangle_aabb.position = p_points[0];
+			ts.triangle_aabb.size = Vector3();
+			ts.triangle_aabb.expand_to(p_points[1]);
+			ts.triangle_aabb.expand_to(p_points[2]);
+			p_triangles_sort.push_back(ts);
 		} else {
-			_plot_triangle_into_triangle_index_list(half_size, n, aabb, p_points, p_triangle_index, triangles, p_grid_size);
+			_plot_triangle_into_triangle_index_list(half_size, n, aabb, p_points, p_triangle_index, p_triangles_sort, p_grid_size);
 		}
 	}
 }
 
+// Recursively partitions the triangles of one grid cell, stored in the p_count entries of p_triangle_sort starting at p_index_start, into clusters of
+// up to p_cluster_size triangles and writes the bounds of each cluster to p_cluster_aabb, starting at p_cluster_index. Ranges that do not fit in one
+// cluster are sorted in place by AABB center along their longest axis and split in two; every cluster of the range except the last one ends up full.
+void LightmapperRD::_sort_triangle_clusters(uint32_t p_cluster_size, uint32_t p_cluster_index, uint32_t p_index_start, uint32_t p_count, LocalVector<TriangleSort> &p_triangle_sort, LocalVector<ClusterAABB> &p_cluster_aabb) {
+	if (p_count == 0) {
+		return;
+	}
+
+	// Compute AABB for all triangles in the range.
+	AABB cluster_aabb = p_triangle_sort[p_index_start].triangle_aabb;
+	for (uint32_t i = 1; i < p_count; i++) {
+		cluster_aabb.merge_with(p_triangle_sort[p_index_start + i].triangle_aabb);
+	}
+
+	if (p_count > p_cluster_size) {
+		switch (cluster_aabb.get_longest_axis_index()) {
+			case 0:
+				SortArray<TriangleSort, TriangleSortAxis<0>>().sort(&p_triangle_sort[p_index_start], p_count);
+				break;
+			case 1:
+				SortArray<TriangleSort, TriangleSortAxis<1>>().sort(&p_triangle_sort[p_index_start], p_count);
+				break;
+			case 2:
+				SortArray<TriangleSort, TriangleSortAxis<2>>().sort(&p_triangle_sort[p_index_start], p_count);
+				break;
+			default:
+				DEV_ASSERT(false && "Invalid axis returned by AABB.");
+				break;
+		}
+		// Keep the left half a whole number of clusters: the right half's first cluster index below is derived from it by integer division, so a
+		// partially filled left cluster would be overwritten. The rounding is a no-op whenever p_cluster_size is a power of two.
+		uint32_t left_cluster_count = next_power_of_2(p_count / 2);
+		left_cluster_count = MAX(left_cluster_count, p_cluster_size);
+		left_cluster_count -= left_cluster_count % p_cluster_size;
+		left_cluster_count = MIN(left_cluster_count, p_count);
+		_sort_triangle_clusters(p_cluster_size, p_cluster_index, p_index_start, left_cluster_count, p_triangle_sort, p_cluster_aabb);
+		if (left_cluster_count < p_count) {
+			uint32_t cluster_index_right = p_cluster_index + (left_cluster_count / p_cluster_size);
+			_sort_triangle_clusters(p_cluster_size, cluster_index_right, p_index_start + left_cluster_count, p_count - left_cluster_count, p_triangle_sort, p_cluster_aabb);
+		}
+	} else {
+		ClusterAABB &aabb = p_cluster_aabb[p_cluster_index];
+		Vector3 aabb_end = cluster_aabb.get_end();
+		aabb.min_bounds[0] = cluster_aabb.position.x;
+		aabb.min_bounds[1] = cluster_aabb.position.y;
+		aabb.min_bounds[2] = cluster_aabb.position.z;
+		aabb.max_bounds[0] = aabb_end.x;
+		aabb.max_bounds[1] = aabb_end.y;
+		aabb.max_bounds[2] = aabb_end.z;
+	}
+}
+
 Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_size, Vector<Ref<Image>> &albedo_images, Vector<Ref<Image>> &emission_images, AABB &bounds, Size2i &atlas_size, int &atlas_slices, BakeStepFunc p_step_function, void *p_bake_userdata) {
 	Vector<Size2i> sizes;
 
@@ -281,7 +337,7 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_
 	return BAKE_OK;
 }
 
-void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &p_probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) {
+void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, uint32_t p_cluster_size, Vector<Probe> &p_probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &r_triangle_indices_buffer, RID &r_cluster_indices_buffer, RID &r_cluster_aabbs_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) {
 	HashMap<Vertex, uint32_t, VertexHash> vertex_map;
 
 	//fill triangles array and vertex array
@@ -433,31 +489,70 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 	//sort it
 	triangle_sort.sort();
 
+	LocalVector<uint32_t> cluster_indices;
+	LocalVector<ClusterAABB> cluster_aabbs;
 	Vector<uint32_t> triangle_indices;
 	triangle_indices.resize(triangle_sort.size());
 	Vector<uint32_t> grid_indices;
 	grid_indices.resize(grid_size * grid_size * grid_size * 2);
 	memset(grid_indices.ptrw(), 0, grid_indices.size() * sizeof(uint32_t));
-	Vector<bool> solid;
-	solid.resize(grid_size * grid_size * grid_size);
-	memset(solid.ptrw(), 0, solid.size() * sizeof(bool));
 
 	{
-		uint32_t *tiw = triangle_indices.ptrw();
+		// Fill grid with cell indices.
 		uint32_t last_cell = 0xFFFFFFFF;
 		uint32_t *giw = grid_indices.ptrw();
-		bool *solidw = solid.ptrw();
+		uint32_t cluster_count = 0;
+		uint32_t solid_cell_count = 0;
 		for (uint32_t i = 0; i < triangle_sort.size(); i++) {
 			uint32_t cell = triangle_sort[i].cell_index;
 			if (cell != last_cell) {
-				//cell changed, update pointer to indices
-				giw[cell * 2 + 1] = i;
-				solidw[cell] = true;
+				giw[cell * 2 + 1] = solid_cell_count;
+				solid_cell_count++;
 			}
-			tiw[i] = triangle_sort[i].triangle_index;
-			giw[cell * 2]++; //update counter
+
+			if ((giw[cell * 2] % p_cluster_size) == 0) {
+				// Add an extra cluster every time the triangle counter reaches a multiple of the cluster size.
+				cluster_count++;
+			}
+
+			giw[cell * 2]++;
 			last_cell = cell;
 		}
+
+		// Build fixed-size triangle clusters for all the cells to speed up the traversal. A cell can hold multiple clusters that each contain a fixed
+		// amount of triangles and an AABB. The tracer will check against the AABBs first to know whether it needs to visit the cell's triangles.
+		//
+		// The building algorithm recursively divides the triangles contained inside each cell, sorting them along the longest axis of their AABB on each step.
+		//
+		// - If the amount of triangles is less than or equal to the cluster size, the AABB will be stored and the algorithm stops.
+		//
+		// - Otherwise, the triangles are split in two. The size of the first half is raised to the next power of two of half the amount of triangles
+		//   (with the cluster size as the minimum value) to ensure the first half always fills its clusters.
+
+		cluster_indices.resize(solid_cell_count * 2);
+		cluster_aabbs.resize(cluster_count);
+
+		uint32_t i = 0;
+		uint32_t cluster_index = 0;
+		uint32_t solid_cell_index = 0;
+		uint32_t *tiw = triangle_indices.ptrw();
+		while (i < triangle_sort.size()) {
+			cluster_indices[solid_cell_index * 2] = cluster_index;
+			cluster_indices[solid_cell_index * 2 + 1] = i;
+
+			uint32_t cell = triangle_sort[i].cell_index;
+			uint32_t triangle_count = giw[cell * 2];
+			uint32_t cell_cluster_count = (triangle_count + p_cluster_size - 1) / p_cluster_size;
+			_sort_triangle_clusters(p_cluster_size, cluster_index, i, triangle_count, triangle_sort, cluster_aabbs);
+
+			for (uint32_t j = 0; j < triangle_count; j++) {
+				tiw[i + j] = triangle_sort[i + j].triangle_index;
+			}
+
+			i += triangle_count;
+			cluster_index += cell_cluster_count;
+			solid_cell_index++;
+		}
 	}
 #if 0
 	for (int i = 0; i < grid_size; i++) {
@@ -507,7 +602,13 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 		triangle_buffer = rd->storage_buffer_create(tb.size(), tb);
 
 		Vector<uint8_t> tib = triangle_indices.to_byte_array();
-		triangle_cell_indices_buffer = rd->storage_buffer_create(tib.size(), tib);
+		r_triangle_indices_buffer = rd->storage_buffer_create(tib.size(), tib);
+
+		Vector<uint8_t> cib = cluster_indices.to_byte_array();
+		r_cluster_indices_buffer = rd->storage_buffer_create(cib.size(), cib);
+
+		Vector<uint8_t> cab = cluster_aabbs.to_byte_array();
+		r_cluster_aabbs_buffer = rd->storage_buffer_create(cab.size(), cab);
 
 		Vector<uint8_t> lb = lights.to_byte_array();
 		if (lb.size() == 0) {
@@ -1020,24 +1121,29 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	RID vertex_buffer;
 	RID triangle_buffer;
 	RID lights_buffer;
-	RID triangle_cell_indices_buffer;
+	RID triangle_indices_buffer;
+	RID cluster_indices_buffer;
+	RID cluster_aabbs_buffer;
 	RID grid_texture;
 	RID seams_buffer;
 	RID probe_positions_buffer;
 
 	Vector<int> slice_seam_count;
 
-#define FREE_BUFFERS                        \
-	rd->free(bake_parameters_buffer);       \
-	rd->free(vertex_buffer);                \
-	rd->free(triangle_buffer);              \
-	rd->free(lights_buffer);                \
-	rd->free(triangle_cell_indices_buffer); \
-	rd->free(grid_texture);                 \
-	rd->free(seams_buffer);                 \
+#define FREE_BUFFERS                   \
+	rd->free(bake_parameters_buffer);  \
+	rd->free(vertex_buffer);           \
+	rd->free(triangle_buffer);         \
+	rd->free(lights_buffer);           \
+	rd->free(triangle_indices_buffer); \
+	rd->free(cluster_indices_buffer);  \
+	rd->free(cluster_aabbs_buffer);    \
+	rd->free(grid_texture);            \
+	rd->free(seams_buffer);            \
 	rd->free(probe_positions_buffer);
 
-	_create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, lights_buffer, triangle_cell_indices_buffer, probe_positions_buffer, grid_texture, seams_buffer, p_step_function, p_bake_userdata);
+	const uint32_t cluster_size = 16;
+	_create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, cluster_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, lights_buffer, triangle_indices_buffer, cluster_indices_buffer, cluster_aabbs_buffer, probe_positions_buffer, grid_texture, seams_buffer, p_step_function, p_bake_userdata);
 
 	// Create global bake parameters buffer.
 	BakeParameters bake_parameters;
@@ -1133,7 +1239,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 			RD::Uniform u;
 			u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
 			u.binding = 3;
-			u.append_id(triangle_cell_indices_buffer);
+			u.append_id(triangle_indices_buffer);
 			base_uniforms.push_back(u);
 		}
 		{
@@ -1185,6 +1291,20 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 			u.append_id(sampler);
 			base_uniforms.push_back(u);
 		}
+		{
+			RD::Uniform u;
+			u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+			u.binding = 11;
+			u.append_id(cluster_indices_buffer);
+			base_uniforms.push_back(u);
+		}
+		{
+			RD::Uniform u;
+			u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+			u.binding = 12;
+			u.append_id(cluster_aabbs_buffer);
+			base_uniforms.push_back(u);
+		}
 	}
 
 	RID raster_base_uniform = rd->uniform_set_create(base_uniforms, rasterize_shader, 0);
@@ -1230,6 +1350,8 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 
 	Ref<RDShaderFile> compute_shader;
 	String defines = "";
+	defines += "\n#define CLUSTER_SIZE " + uitos(cluster_size) + "\n";
+
 	if (p_bake_sh) {
 		defines += "\n#define USE_SH_LIGHTMAPS\n";
 	}
diff --git a/modules/lightmapper_rd/lightmapper_rd.h b/modules/lightmapper_rd/lightmapper_rd.h
index 8c1c4deba6f..5414048ddcc 100644
--- a/modules/lightmapper_rd/lightmapper_rd.h
+++ b/modules/lightmapper_rd/lightmapper_rd.h
@@ -192,6 +192,13 @@ class LightmapperRD : public Lightmapper {
 		}
 	};
 
+	struct ClusterAABB { // Bounds of one triangle cluster; uploaded as raw bytes to the storage buffer read as ClusterAABB in lm_common_inc.glsl.
+		float min_bounds[3];
+		float pad0 = 0.0f; // Occupies the slot declared as `uint pad0` in the shader-side struct.
+		float max_bounds[3];
+		float pad1 = 0.0f; // Occupies the slot declared as `uint pad1` in the shader-side struct.
+	};
+
 	Vector<MeshInstance> mesh_instances;
 
 	Vector<Light> lights;
@@ -199,12 +206,22 @@ class LightmapperRD : public Lightmapper {
 	struct TriangleSort {
 		uint32_t cell_index = 0;
 		uint32_t triangle_index = 0;
+		AABB triangle_aabb; // Bounds of the triangle's three vertices; TriangleSortAxis compares entries by the center of this box.
+
 		bool operator<(const TriangleSort &p_triangle_sort) const {
 			return cell_index < p_triangle_sort.cell_index; //sorting by triangle index in this case makes no sense
 		}
 	};
 
+	template <int T>
+	struct TriangleSortAxis { // Comparator for SortArray: orders TriangleSort entries by the center of their triangle_aabb along axis T (0, 1, 2 = x, y, z).
+		bool operator()(const TriangleSort &p_a, const TriangleSort &p_b) const {
+			return p_a.triangle_aabb.get_center()[T] < p_b.triangle_aabb.get_center()[T];
+		}
+	};
+
 	void _plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector<TriangleSort> &triangles, uint32_t p_grid_size);
+	void _sort_triangle_clusters(uint32_t p_cluster_size, uint32_t p_cluster_index, uint32_t p_index_start, uint32_t p_count, LocalVector<TriangleSort> &p_triangle_sort, LocalVector<ClusterAABB> &p_cluster_aabb);
 
 	struct RasterPushConstant {
 		float atlas_size[2] = {};
@@ -250,7 +267,7 @@ class LightmapperRD : public Lightmapper {
 	};
 
 	BakeError _blit_meshes_into_atlas(int p_max_texture_size, Vector<Ref<Image>> &albedo_images, Vector<Ref<Image>> &emission_images, AABB &bounds, Size2i &atlas_size, int &atlas_slices, BakeStepFunc p_step_function, void *p_bake_userdata);
-	void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata);
+	void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, uint32_t p_cluster_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &r_triangle_indices_buffer, RID &r_cluster_indices_buffer, RID &r_cluster_aabbs_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata);
 	void _raster_geometry(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, int grid_size, AABB bounds, float p_bias, Vector<int> slice_triangle_count, RID position_tex, RID unocclude_tex, RID normal_tex, RID raster_depth_buffer, RID rasterize_shader, RID raster_base_uniform);
 
 	BakeError _dilate(RenderingDevice *rd, Ref<RDShaderFile> &compute_shader, RID &compute_base_uniform_set, PushConstant &push_constant, RID &source_light_tex, RID &dest_light_tex, const Size2i &atlas_size, int atlas_slices);
diff --git a/modules/lightmapper_rd/lm_common_inc.glsl b/modules/lightmapper_rd/lm_common_inc.glsl
index c91f06d0f30..98d11b9e69e 100644
--- a/modules/lightmapper_rd/lm_common_inc.glsl
+++ b/modules/lightmapper_rd/lm_common_inc.glsl
@@ -42,15 +42,22 @@ struct Triangle {
 	uint pad1;
 };
 
+struct ClusterAABB { // Element type of the cluster_aabbs buffer; mirrors the ClusterAABB struct declared in lightmapper_rd.h.
+	vec3 min_bounds;
+	uint pad0; // Corresponds to `float pad0` in the C++ struct.
+	vec3 max_bounds;
+	uint pad1; // Corresponds to `float pad1` in the C++ struct.
+};
+
 layout(set = 0, binding = 2, std430) restrict readonly buffer Triangles {
 	Triangle data[];
 }
 triangles;
 
-layout(set = 0, binding = 3, std430) restrict readonly buffer GridIndices {
+layout(set = 0, binding = 3, std430) restrict readonly buffer TriangleIndices {
 	uint data[];
 }
-grid_indices;
+triangle_indices;
 
 #define LIGHT_TYPE_DIRECTIONAL 0
 #define LIGHT_TYPE_OMNI 1
@@ -104,6 +111,16 @@ layout(set = 0, binding = 9) uniform texture2DArray emission_tex;
 
 layout(set = 0, binding = 10) uniform sampler linear_sampler;
 
+layout(set = 0, binding = 11, std430) restrict readonly buffer ClusterIndices {
+	uint data[]; // Two entries per non-empty grid cell: index of its first cluster AABB, then index of its first entry in triangle_indices.
+}
+cluster_indices;
+
+layout(set = 0, binding = 12, std430) restrict readonly buffer ClusterAABBs {
+	ClusterAABB data[]; // One AABB per triangle cluster; the clusters that belong to the same grid cell are stored contiguously.
+}
+cluster_aabbs;
+
 // Fragment action constants
 const uint FA_NONE = 0;
 const uint FA_SMOOTHEN_POSITION = 1;
diff --git a/modules/lightmapper_rd/lm_compute.glsl b/modules/lightmapper_rd/lm_compute.glsl
index 572e6d55d8d..a2a480043ab 100644
--- a/modules/lightmapper_rd/lm_compute.glsl
+++ b/modules/lightmapper_rd/lm_compute.glsl
@@ -119,6 +119,17 @@ const uint RAY_FRONT = 1;
 const uint RAY_BACK = 2;
 const uint RAY_ANY = 3;
 
+// Slab test against the box [p_box_min, p_box_max] (p_inv_dir: presumably the per-component reciprocal of the ray direction). The overlap is not clamped to t >= 0 or to the ray length.
+bool ray_box_test(vec3 p_from, vec3 p_inv_dir, vec3 p_box_min, vec3 p_box_max) {
+	vec3 t_lo = (p_box_min - p_from) * p_inv_dir, t_hi = (p_box_max - p_from) * p_inv_dir;
+	vec3 t_near = min(t_lo, t_hi), t_far = max(t_lo, t_hi);
+	return max(t_near.x, max(t_near.y, t_near.z)) <= min(t_far.x, min(t_far.y, t_far.z));
+}
+
+#if CLUSTER_SIZE > 32
+#define CLUSTER_TRIANGLE_ITERATION
+#endif
+
 uint trace_ray(vec3 p_from, vec3 p_to, bool p_any_hit, out float r_distance, out vec3 r_normal, out uint r_triangle, out vec3 r_barycentric) {
 	// World coordinates.
 	vec3 rel = p_to - p_from;
@@ -142,60 +153,106 @@ uint trace_ray(vec3 p_from, vec3 p_to, bool p_any_hit, out float r_distance, out
 	uint iters = 0;
 	while (all(greaterThanEqual(icell, ivec3(0))) && all(lessThan(icell, ivec3(bake_params.grid_size))) && (iters < 1000)) {
 		uvec2 cell_data = texelFetch(usampler3D(grid, linear_sampler), icell, 0).xy;
-		if (cell_data.x > 0) { //triangles here
+		uint triangle_count = cell_data.x;
+		if (triangle_count > 0) {
 			uint hit = RAY_MISS;
 			float best_distance = 1e20;
-
-			for (uint i = 0; i < cell_data.x; i++) {
-				uint tidx = grid_indices.data[cell_data.y + i];
-
-				// Ray-Box test.
-				Triangle triangle = triangles.data[tidx];
-				vec3 t0 = (triangle.min_bounds - p_from) * inv_dir;
-				vec3 t1 = (triangle.max_bounds - p_from) * inv_dir;
-				vec3 tmin = min(t0, t1), tmax = max(t0, t1);
-
-				if (max(tmin.x, max(tmin.y, tmin.z)) > min(tmax.x, min(tmax.y, tmax.z))) {
-					continue; // Ray-Box test failed.
-				}
-
-				// Prepare triangle vertices.
-				vec3 vtx0 = vertices.data[triangle.indices.x].position;
-				vec3 vtx1 = vertices.data[triangle.indices.y].position;
-				vec3 vtx2 = vertices.data[triangle.indices.z].position;
-				vec3 normal = -normalize(cross((vtx0 - vtx1), (vtx0 - vtx2)));
-				bool backface = dot(normal, dir) >= 0.0;
-				float distance;
-				vec3 barycentric;
-				if (ray_hits_triangle(p_from, dir, rel_len, vtx0, vtx1, vtx2, distance, barycentric)) {
-					if (p_any_hit) {
-						// Return early if any hit was requested.
-						return RAY_ANY;
-					}
-
-					vec3 position = p_from + dir * distance;
-					vec3 hit_cell = (position - bake_params.to_cell_offset) * bake_params.to_cell_size;
-					if (icell != ivec3(hit_cell)) {
-						// It's possible for the ray to hit a triangle in a position outside the bounds of the cell
-						// if it's large enough to cover multiple ones. The hit must be ignored if this is the case.
-						continue;
-					}
-
-					if (!backface) {
-						// The case of meshes having both a front and back face in the same plane is more common than expected.
-						// If this is a front-face, bias it closer to the ray origin, so it always wins over the back-face.
-						distance = max(bake_params.bias, distance - bake_params.bias);
-					}
-
-					if (distance < best_distance) {
-						hit = backface ? RAY_BACK : RAY_FRONT;
-						best_distance = distance;
-						r_distance = distance;
-						r_normal = normal;
-						r_triangle = tidx;
-						r_barycentric = barycentric;
+			uint cluster_start = cluster_indices.data[cell_data.y * 2];
+			uint cell_triangle_start = cluster_indices.data[cell_data.y * 2 + 1];
+			uint cluster_count = (triangle_count + CLUSTER_SIZE - 1) / CLUSTER_SIZE;
+			uint cluster_base_index = 0;
+			while (cluster_base_index < cluster_count) {
+				// To minimize divergence, all Ray-AABB tests on the clusters contained in the cell are performed
+				// before checking against the triangles. We do this 32 clusters at a time and store the intersected
+				// clusters on each bit of the 32-bit integer.
+				uint cluster_test_count = min(32, cluster_count - cluster_base_index);
+				uint cluster_hits = 0;
+				for (uint i = 0; i < cluster_test_count; i++) {
+					uint cluster_index = cluster_start + cluster_base_index + i;
+					ClusterAABB cluster_aabb = cluster_aabbs.data[cluster_index];
+					if (ray_box_test(p_from, inv_dir, cluster_aabb.min_bounds, cluster_aabb.max_bounds)) {
+						cluster_hits |= (1 << i);
 					}
 				}
+
+				// Check the triangles of every cluster that was intersected, clearing the matching bit of the 32-bit
+				// mask each time until no bits are left.
+				while (cluster_hits > 0) {
+					uint cluster_index = findLSB(cluster_hits);
+					cluster_hits &= ~(1 << cluster_index);
+					cluster_index += cluster_base_index;
+
+					// Do the same divergence execution trick with triangles as well.
+					uint triangle_base_index = 0;
+#ifdef CLUSTER_TRIANGLE_ITERATION
+					while (triangle_base_index < triangle_count)
+#endif
+					{
+						uint triangle_start_index = cell_triangle_start + cluster_index * CLUSTER_SIZE + triangle_base_index;
+						uint triangle_test_count = min(CLUSTER_SIZE, triangle_count - triangle_base_index);
+						uint triangle_hits = 0;
+						for (uint i = 0; i < triangle_test_count; i++) {
+							uint triangle_index = triangle_indices.data[triangle_start_index + i];
+							if (ray_box_test(p_from, inv_dir, triangles.data[triangle_index].min_bounds, triangles.data[triangle_index].max_bounds)) {
+								triangle_hits |= (1 << i);
+							}
+						}
+
+						while (triangle_hits > 0) {
+							uint cluster_triangle_index = findLSB(triangle_hits);
+							triangle_hits &= ~(1 << cluster_triangle_index);
+							cluster_triangle_index += triangle_start_index;
+
+							uint triangle_index = triangle_indices.data[cluster_triangle_index];
+							Triangle triangle = triangles.data[triangle_index];
+
+							// Gather the triangle vertex positions.
+							vec3 vtx0 = vertices.data[triangle.indices.x].position;
+							vec3 vtx1 = vertices.data[triangle.indices.y].position;
+							vec3 vtx2 = vertices.data[triangle.indices.z].position;
+							vec3 normal = -normalize(cross((vtx0 - vtx1), (vtx0 - vtx2)));
+							bool backface = dot(normal, dir) >= 0.0;
+							float distance;
+							vec3 barycentric;
+							if (ray_hits_triangle(p_from, dir, rel_len, vtx0, vtx1, vtx2, distance, barycentric)) {
+								if (p_any_hit) {
+									// Return early if any hit was requested.
+									return RAY_ANY;
+								}
+
+								vec3 position = p_from + dir * distance;
+								vec3 hit_cell = (position - bake_params.to_cell_offset) * bake_params.to_cell_size;
+								if (icell != ivec3(hit_cell)) {
+									// It's possible for the ray to hit a triangle in a position outside the bounds of the cell
+									// if it's large enough to cover multiple ones. The hit must be ignored if this is the case.
+									continue;
+								}
+
+								if (!backface) {
+									// The case of meshes having both a front and back face in the same plane is more common than
+									// expected, so if this is a front-face, bias it closer to the ray origin, so it always wins
+									// over the back-face.
+									distance = max(bake_params.bias, distance - bake_params.bias);
+								}
+
+								if (distance < best_distance) {
+									hit = backface ? RAY_BACK : RAY_FRONT;
+									best_distance = distance;
+									r_distance = distance;
+									r_normal = normal;
+									r_triangle = triangle_index;
+									r_barycentric = barycentric;
+								}
+							}
+						}
+
+#ifdef CLUSTER_TRIANGLE_ITERATION
+						triangle_base_index += CLUSTER_SIZE;
+#endif
+					}
+				}
+
+				cluster_base_index += 32;
 			}
 
 			if (hit != RAY_MISS) {