From 47214ea9f5e76fb4a54b902ffafe1081a57601d3 Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 13 Oct 2023 11:39:48 -0300 Subject: [PATCH] Optimize lightmapper using triangle clusters on the acceleration structure. Add an additional layer of indirection to the grid used by the lightmapper to store fixed-size triangle clusters. Greatly speeds up baking times on scenes with high triangle density, as the clusters will help to avoid unnecessary checks when the triangle density is high on the scene. --- modules/lightmapper_rd/lightmapper_rd.cpp | 174 ++++++++++++++++++---- modules/lightmapper_rd/lightmapper_rd.h | 19 ++- modules/lightmapper_rd/lm_common_inc.glsl | 21 ++- modules/lightmapper_rd/lm_compute.glsl | 157 ++++++++++++------- 4 files changed, 292 insertions(+), 79 deletions(-) diff --git a/modules/lightmapper_rd/lightmapper_rd.cpp b/modules/lightmapper_rd/lightmapper_rd.cpp index feb9a2274e1..fe919953c1e 100644 --- a/modules/lightmapper_rd/lightmapper_rd.cpp +++ b/modules/lightmapper_rd/lightmapper_rd.cpp @@ -124,7 +124,7 @@ void LightmapperRD::add_probe(const Vector3 &p_position) { probe_positions.push_back(probe); } -void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector &triangles, uint32_t p_grid_size) { +void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector &p_triangles_sort, uint32_t p_grid_size) { int half_size = p_size / 2; for (int i = 0; i < 8; i++) { @@ -159,13 +159,69 @@ void LightmapperRD::_plot_triangle_into_triangle_index_list(int p_size, const Ve TriangleSort ts; ts.cell_index = n.x + (n.y * p_grid_size) + (n.z * p_grid_size * p_grid_size); ts.triangle_index = p_triangle_index; - triangles.push_back(ts); + ts.triangle_aabb.position = p_points[0]; + ts.triangle_aabb.size = Vector3(); + ts.triangle_aabb.expand_to(p_points[1]); + ts.triangle_aabb.expand_to(p_points[2]); + p_triangles_sort.push_back(ts); } else { - _plot_triangle_into_triangle_index_list(half_size, n, aabb, p_points, p_triangle_index, triangles, p_grid_size); + _plot_triangle_into_triangle_index_list(half_size, n, aabb, p_points, p_triangle_index, p_triangles_sort, p_grid_size); } } } +void LightmapperRD::_sort_triangle_clusters(uint32_t p_cluster_size, uint32_t p_cluster_index, uint32_t p_index_start, uint32_t p_count, LocalVector &p_triangle_sort, LocalVector &p_cluster_aabb) { + if (p_count == 0) { + return; + } + + // Compute AABB for all triangles in the range. + SortArray> triangle_sorter_x; + SortArray> triangle_sorter_y; + SortArray> triangle_sorter_z; + AABB cluster_aabb = p_triangle_sort[p_index_start].triangle_aabb; + for (uint32_t i = 1; i < p_count; i++) { + cluster_aabb.merge_with(p_triangle_sort[p_index_start + i].triangle_aabb); + } + + if (p_count > p_cluster_size) { + int longest_axis_index = cluster_aabb.get_longest_axis_index(); + switch (longest_axis_index) { + case 0: + triangle_sorter_x.sort(&p_triangle_sort[p_index_start], p_count); + break; + case 1: + triangle_sorter_y.sort(&p_triangle_sort[p_index_start], p_count); + break; + case 2: + triangle_sorter_z.sort(&p_triangle_sort[p_index_start], p_count); + break; + default: + DEV_ASSERT(false && "Invalid axis returned by AABB."); + break; + } + + uint32_t left_cluster_count = next_power_of_2(p_count / 2); + left_cluster_count = MAX(left_cluster_count, p_cluster_size); + left_cluster_count = MIN(left_cluster_count, p_count); + _sort_triangle_clusters(p_cluster_size, p_cluster_index, p_index_start, left_cluster_count, p_triangle_sort, p_cluster_aabb); + + if (left_cluster_count < p_count) { + uint32_t cluster_index_right = p_cluster_index + (left_cluster_count / p_cluster_size); + _sort_triangle_clusters(p_cluster_size, cluster_index_right, p_index_start + left_cluster_count, p_count - left_cluster_count, p_triangle_sort, p_cluster_aabb); + } + } else { + ClusterAABB &aabb = p_cluster_aabb[p_cluster_index]; + Vector3 aabb_end = cluster_aabb.get_end(); + aabb.min_bounds[0] = cluster_aabb.position.x; + aabb.min_bounds[1] = cluster_aabb.position.y; + aabb.min_bounds[2] = cluster_aabb.position.z; + aabb.max_bounds[0] = aabb_end.x; + aabb.max_bounds[1] = aabb_end.y; + aabb.max_bounds[2] = aabb_end.z; + } +} + Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_size, Vector> &albedo_images, Vector> &emission_images, AABB &bounds, Size2i &atlas_size, int &atlas_slices, BakeStepFunc p_step_function, void *p_bake_userdata) { Vector sizes; @@ -281,7 +337,7 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_ return BAKE_OK; } -void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector &p_probe_positions, GenerateProbes p_generate_probes, Vector &slice_triangle_count, Vector &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) { +void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, uint32_t p_cluster_size, Vector &p_probe_positions, GenerateProbes p_generate_probes, Vector &slice_triangle_count, Vector &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &r_triangle_indices_buffer, RID &r_cluster_indices_buffer, RID &r_cluster_aabbs_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) { HashMap vertex_map; //fill triangles array and vertex array @@ -433,31 +489,70 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i //sort it triangle_sort.sort(); + LocalVector cluster_indices; + LocalVector cluster_aabbs; Vector triangle_indices; triangle_indices.resize(triangle_sort.size()); Vector grid_indices; grid_indices.resize(grid_size * grid_size * grid_size * 2); memset(grid_indices.ptrw(), 0, grid_indices.size() * sizeof(uint32_t)); - Vector solid; - solid.resize(grid_size * grid_size * grid_size); - memset(solid.ptrw(), 0, solid.size() * sizeof(bool)); { - uint32_t *tiw = triangle_indices.ptrw(); + // Fill grid with cell indices. uint32_t last_cell = 0xFFFFFFFF; uint32_t *giw = grid_indices.ptrw(); - bool *solidw = solid.ptrw(); + uint32_t cluster_count = 0; + uint32_t solid_cell_count = 0; for (uint32_t i = 0; i < triangle_sort.size(); i++) { uint32_t cell = triangle_sort[i].cell_index; if (cell != last_cell) { - //cell changed, update pointer to indices - giw[cell * 2 + 1] = i; - solidw[cell] = true; + giw[cell * 2 + 1] = solid_cell_count; + solid_cell_count++; } - tiw[i] = triangle_sort[i].triangle_index; - giw[cell * 2]++; //update counter + + if ((giw[cell * 2] % p_cluster_size) == 0) { + // Add an extra cluster every time the triangle counter reaches a multiple of the cluster size. + cluster_count++; + } + + giw[cell * 2]++; last_cell = cell; } + + // Build fixed-size triangle clusters for all the cells to speed up the traversal. A cell can hold multiple clusters that each contain a fixed + // amount of triangles and an AABB. The tracer will check against the AABBs first to know whether it needs to visit the cell's triangles. + // + // The building algorithm will divide the triangles recursively contained inside each cell, sorting by the longest axis of the AABB on each step. + // + // - If the amount of triangles is less or equal to the cluster size, the AABB will be stored and the algorithm stops. + // + // - The division by two is increased to the next power of two of half the amount of triangles (with cluster size as the minimum value) to + // ensure the first half always fills the cluster. + + cluster_indices.resize(solid_cell_count * 2); + cluster_aabbs.resize(cluster_count); + + uint32_t i = 0; + uint32_t cluster_index = 0; + uint32_t solid_cell_index = 0; + uint32_t *tiw = triangle_indices.ptrw(); + while (i < triangle_sort.size()) { + cluster_indices[solid_cell_index * 2] = cluster_index; + cluster_indices[solid_cell_index * 2 + 1] = i; + + uint32_t cell = triangle_sort[i].cell_index; + uint32_t triangle_count = giw[cell * 2]; + uint32_t cell_cluster_count = (triangle_count + p_cluster_size - 1) / p_cluster_size; + _sort_triangle_clusters(p_cluster_size, cluster_index, i, triangle_count, triangle_sort, cluster_aabbs); + + for (uint32_t j = 0; j < triangle_count; j++) { + tiw[i + j] = triangle_sort[i + j].triangle_index; + } + + i += triangle_count; + cluster_index += cell_cluster_count; + solid_cell_index++; + } } #if 0 for (int i = 0; i < grid_size; i++) { @@ -507,7 +602,13 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i triangle_buffer = rd->storage_buffer_create(tb.size(), tb); Vector tib = triangle_indices.to_byte_array(); - triangle_cell_indices_buffer = rd->storage_buffer_create(tib.size(), tib); + r_triangle_indices_buffer = rd->storage_buffer_create(tib.size(), tib); + + Vector cib = cluster_indices.to_byte_array(); + r_cluster_indices_buffer = rd->storage_buffer_create(cib.size(), cib); + + Vector cab = cluster_aabbs.to_byte_array(); + r_cluster_aabbs_buffer = rd->storage_buffer_create(cab.size(), cab); Vector lb = lights.to_byte_array(); if (lb.size() == 0) { @@ -1020,24 +1121,29 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d RID vertex_buffer; RID triangle_buffer; RID lights_buffer; - RID triangle_cell_indices_buffer; + RID triangle_indices_buffer; + RID cluster_indices_buffer; + RID cluster_aabbs_buffer; RID grid_texture; RID seams_buffer; RID probe_positions_buffer; Vector slice_seam_count; -#define FREE_BUFFERS \ - rd->free(bake_parameters_buffer); \ - rd->free(vertex_buffer); \ - rd->free(triangle_buffer); \ - rd->free(lights_buffer); \ - rd->free(triangle_cell_indices_buffer); \ - rd->free(grid_texture); \ - rd->free(seams_buffer); \ +#define FREE_BUFFERS \ + rd->free(bake_parameters_buffer); \ + rd->free(vertex_buffer); \ + rd->free(triangle_buffer); \ + rd->free(lights_buffer); \ + rd->free(triangle_indices_buffer); \ + rd->free(cluster_indices_buffer); \ + rd->free(cluster_aabbs_buffer); \ + rd->free(grid_texture); \ + rd->free(seams_buffer); \ rd->free(probe_positions_buffer); - _create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, lights_buffer, triangle_cell_indices_buffer, probe_positions_buffer, grid_texture, seams_buffer, p_step_function, p_bake_userdata); + const uint32_t cluster_size = 16; + _create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, cluster_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, lights_buffer, triangle_indices_buffer, cluster_indices_buffer, cluster_aabbs_buffer, probe_positions_buffer, grid_texture, seams_buffer, p_step_function, p_bake_userdata); // Create global bake parameters buffer. BakeParameters bake_parameters; @@ -1133,7 +1239,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d RD::Uniform u; u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; u.binding = 3; - u.append_id(triangle_cell_indices_buffer); + u.append_id(triangle_indices_buffer); base_uniforms.push_back(u); } { @@ -1185,6 +1291,20 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d u.append_id(sampler); base_uniforms.push_back(u); } + { + RD::Uniform u; + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; + u.binding = 11; + u.append_id(cluster_indices_buffer); + base_uniforms.push_back(u); + } + { + RD::Uniform u; + u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; + u.binding = 12; + u.append_id(cluster_aabbs_buffer); + base_uniforms.push_back(u); + } } RID raster_base_uniform = rd->uniform_set_create(base_uniforms, rasterize_shader, 0); @@ -1230,6 +1350,8 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d Ref compute_shader; String defines = ""; + defines += "\n#define CLUSTER_SIZE " + uitos(cluster_size) + "\n"; + if (p_bake_sh) { defines += "\n#define USE_SH_LIGHTMAPS\n"; } diff --git a/modules/lightmapper_rd/lightmapper_rd.h b/modules/lightmapper_rd/lightmapper_rd.h index 8c1c4deba6f..5414048ddcc 100644 --- a/modules/lightmapper_rd/lightmapper_rd.h +++ b/modules/lightmapper_rd/lightmapper_rd.h @@ -192,6 +192,13 @@ class LightmapperRD : public Lightmapper { } }; + struct ClusterAABB { + float min_bounds[3]; + float pad0 = 0.0f; + float max_bounds[3]; + float pad1 = 0.0f; + }; + Vector mesh_instances; Vector lights; @@ -199,12 +206,22 @@ class LightmapperRD : public Lightmapper { struct TriangleSort { uint32_t cell_index = 0; uint32_t triangle_index = 0; + AABB triangle_aabb; + bool operator<(const TriangleSort &p_triangle_sort) const { return cell_index < p_triangle_sort.cell_index; //sorting by triangle index in this case makes no sense } }; + template + struct TriangleSortAxis { + bool operator()(const TriangleSort &p_a, const TriangleSort &p_b) const { + return p_a.triangle_aabb.get_center()[T] < p_b.triangle_aabb.get_center()[T]; + } + }; + void _plot_triangle_into_triangle_index_list(int p_size, const Vector3i &p_ofs, const AABB &p_bounds, const Vector3 p_points[3], uint32_t p_triangle_index, LocalVector &triangles, uint32_t p_grid_size); + void _sort_triangle_clusters(uint32_t p_cluster_size, uint32_t p_cluster_index, uint32_t p_index_start, uint32_t p_count, LocalVector &p_triangle_sort, LocalVector &p_cluster_aabb); struct RasterPushConstant { float atlas_size[2] = {}; @@ -250,7 +267,7 @@ class LightmapperRD : public Lightmapper { }; BakeError _blit_meshes_into_atlas(int p_max_texture_size, Vector> &albedo_images, Vector> &emission_images, AABB &bounds, Size2i &atlas_size, int &atlas_slices, BakeStepFunc p_step_function, void *p_bake_userdata); - void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector &probe_positions, GenerateProbes p_generate_probes, Vector &slice_triangle_count, Vector &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata); + void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, uint32_t p_cluster_size, Vector &probe_positions, GenerateProbes p_generate_probes, Vector &slice_triangle_count, Vector &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &lights_buffer, RID &r_triangle_indices_buffer, RID &r_cluster_indices_buffer, RID &r_cluster_aabbs_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata); void _raster_geometry(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, int grid_size, AABB bounds, float p_bias, Vector slice_triangle_count, RID position_tex, RID unocclude_tex, RID normal_tex, RID raster_depth_buffer, RID rasterize_shader, RID raster_base_uniform); BakeError _dilate(RenderingDevice *rd, Ref &compute_shader, RID &compute_base_uniform_set, PushConstant &push_constant, RID &source_light_tex, RID &dest_light_tex, const Size2i &atlas_size, int atlas_slices); diff --git a/modules/lightmapper_rd/lm_common_inc.glsl b/modules/lightmapper_rd/lm_common_inc.glsl index c91f06d0f30..98d11b9e69e 100644 --- a/modules/lightmapper_rd/lm_common_inc.glsl +++ b/modules/lightmapper_rd/lm_common_inc.glsl @@ -42,15 +42,22 @@ struct Triangle { uint pad1; }; +struct ClusterAABB { + vec3 min_bounds; + uint pad0; + vec3 max_bounds; + uint pad1; +}; + layout(set = 0, binding = 2, std430) restrict readonly buffer Triangles { Triangle data[]; } triangles; -layout(set = 0, binding = 3, std430) restrict readonly buffer GridIndices { +layout(set = 0, binding = 3, std430) restrict readonly buffer TriangleIndices { uint data[]; } -grid_indices; +triangle_indices; #define LIGHT_TYPE_DIRECTIONAL 0 #define LIGHT_TYPE_OMNI 1 @@ -104,6 +111,16 @@ layout(set = 0, binding = 9) uniform texture2DArray emission_tex; layout(set = 0, binding = 10) uniform sampler linear_sampler; +layout(set = 0, binding = 11, std430) restrict readonly buffer ClusterIndices { + uint data[]; +} +cluster_indices; + +layout(set = 0, binding = 12, std430) restrict readonly buffer ClusterAABBs { + ClusterAABB data[]; +} +cluster_aabbs; + // Fragment action constants const uint FA_NONE = 0; const uint FA_SMOOTHEN_POSITION = 1; diff --git a/modules/lightmapper_rd/lm_compute.glsl b/modules/lightmapper_rd/lm_compute.glsl index 572e6d55d8d..a2a480043ab 100644 --- a/modules/lightmapper_rd/lm_compute.glsl +++ b/modules/lightmapper_rd/lm_compute.glsl @@ -119,6 +119,17 @@ const uint RAY_FRONT = 1; const uint RAY_BACK = 2; const uint RAY_ANY = 3; +bool ray_box_test(vec3 p_from, vec3 p_inv_dir, vec3 p_box_min, vec3 p_box_max) { + vec3 t0 = (p_box_min - p_from) * p_inv_dir; + vec3 t1 = (p_box_max - p_from) * p_inv_dir; + vec3 tmin = min(t0, t1), tmax = max(t0, t1); + return max(tmin.x, max(tmin.y, tmin.z)) <= min(tmax.x, min(tmax.y, tmax.z)); +} + +#if CLUSTER_SIZE > 32 +#define CLUSTER_TRIANGLE_ITERATION +#endif + uint trace_ray(vec3 p_from, vec3 p_to, bool p_any_hit, out float r_distance, out vec3 r_normal, out uint r_triangle, out vec3 r_barycentric) { // World coordinates. vec3 rel = p_to - p_from; @@ -142,60 +153,106 @@ uint trace_ray(vec3 p_from, vec3 p_to, bool p_any_hit, out float r_distance, out uint iters = 0; while (all(greaterThanEqual(icell, ivec3(0))) && all(lessThan(icell, ivec3(bake_params.grid_size))) && (iters < 1000)) { uvec2 cell_data = texelFetch(usampler3D(grid, linear_sampler), icell, 0).xy; - if (cell_data.x > 0) { //triangles here + uint triangle_count = cell_data.x; + if (triangle_count > 0) { uint hit = RAY_MISS; float best_distance = 1e20; - - for (uint i = 0; i < cell_data.x; i++) { - uint tidx = grid_indices.data[cell_data.y + i]; - - // Ray-Box test. - Triangle triangle = triangles.data[tidx]; - vec3 t0 = (triangle.min_bounds - p_from) * inv_dir; - vec3 t1 = (triangle.max_bounds - p_from) * inv_dir; - vec3 tmin = min(t0, t1), tmax = max(t0, t1); - - if (max(tmin.x, max(tmin.y, tmin.z)) > min(tmax.x, min(tmax.y, tmax.z))) { - continue; // Ray-Box test failed. - } - - // Prepare triangle vertices. - vec3 vtx0 = vertices.data[triangle.indices.x].position; - vec3 vtx1 = vertices.data[triangle.indices.y].position; - vec3 vtx2 = vertices.data[triangle.indices.z].position; - vec3 normal = -normalize(cross((vtx0 - vtx1), (vtx0 - vtx2))); - bool backface = dot(normal, dir) >= 0.0; - float distance; - vec3 barycentric; - if (ray_hits_triangle(p_from, dir, rel_len, vtx0, vtx1, vtx2, distance, barycentric)) { - if (p_any_hit) { - // Return early if any hit was requested. - return RAY_ANY; - } - - vec3 position = p_from + dir * distance; - vec3 hit_cell = (position - bake_params.to_cell_offset) * bake_params.to_cell_size; - if (icell != ivec3(hit_cell)) { - // It's possible for the ray to hit a triangle in a position outside the bounds of the cell - // if it's large enough to cover multiple ones. The hit must be ignored if this is the case. - continue; - } - - if (!backface) { - // The case of meshes having both a front and back face in the same plane is more common than expected. - // If this is a front-face, bias it closer to the ray origin, so it always wins over the back-face. - distance = max(bake_params.bias, distance - bake_params.bias); - } - - if (distance < best_distance) { - hit = backface ? RAY_BACK : RAY_FRONT; - best_distance = distance; - r_distance = distance; - r_normal = normal; - r_triangle = tidx; - r_barycentric = barycentric; + uint cluster_start = cluster_indices.data[cell_data.y * 2]; + uint cell_triangle_start = cluster_indices.data[cell_data.y * 2 + 1]; + uint cluster_count = (triangle_count + CLUSTER_SIZE - 1) / CLUSTER_SIZE; + uint cluster_base_index = 0; + while (cluster_base_index < cluster_count) { + // To minimize divergence, all Ray-AABB tests on the clusters contained in the cell are performed + // before checking against the triangles. We do this 32 clusters at a time and store the intersected + // clusters on each bit of the 32-bit integer. + uint cluster_test_count = min(32, cluster_count - cluster_base_index); + uint cluster_hits = 0; + for (uint i = 0; i < cluster_test_count; i++) { + uint cluster_index = cluster_start + cluster_base_index + i; + ClusterAABB cluster_aabb = cluster_aabbs.data[cluster_index]; + if (ray_box_test(p_from, inv_dir, cluster_aabb.min_bounds, cluster_aabb.max_bounds)) { + cluster_hits |= (1 << i); } } + + // Check the triangles in any of the clusters that were intersected by toggling off the bits in the + // 32-bit integer counter until no bits are left. + while (cluster_hits > 0) { + uint cluster_index = findLSB(cluster_hits); + cluster_hits &= ~(1 << cluster_index); + cluster_index += cluster_base_index; + + // Do the same divergence execution trick with triangles as well. + uint triangle_base_index = 0; +#ifdef CLUSTER_TRIANGLE_ITERATION + while (triangle_base_index < triangle_count) +#endif + { + uint triangle_start_index = cell_triangle_start + cluster_index * CLUSTER_SIZE + triangle_base_index; + uint triangle_test_count = min(CLUSTER_SIZE, triangle_count - triangle_base_index); + uint triangle_hits = 0; + for (uint i = 0; i < triangle_test_count; i++) { + uint triangle_index = triangle_indices.data[triangle_start_index + i]; + if (ray_box_test(p_from, inv_dir, triangles.data[triangle_index].min_bounds, triangles.data[triangle_index].max_bounds)) { + triangle_hits |= (1 << i); + } + } + + while (triangle_hits > 0) { + uint cluster_triangle_index = findLSB(triangle_hits); + triangle_hits &= ~(1 << cluster_triangle_index); + cluster_triangle_index += triangle_start_index; + + uint triangle_index = triangle_indices.data[cluster_triangle_index]; + Triangle triangle = triangles.data[triangle_index]; + + // Gather the triangle vertex positions. + vec3 vtx0 = vertices.data[triangle.indices.x].position; + vec3 vtx1 = vertices.data[triangle.indices.y].position; + vec3 vtx2 = vertices.data[triangle.indices.z].position; + vec3 normal = -normalize(cross((vtx0 - vtx1), (vtx0 - vtx2))); + bool backface = dot(normal, dir) >= 0.0; + float distance; + vec3 barycentric; + if (ray_hits_triangle(p_from, dir, rel_len, vtx0, vtx1, vtx2, distance, barycentric)) { + if (p_any_hit) { + // Return early if any hit was requested. + return RAY_ANY; + } + + vec3 position = p_from + dir * distance; + vec3 hit_cell = (position - bake_params.to_cell_offset) * bake_params.to_cell_size; + if (icell != ivec3(hit_cell)) { + // It's possible for the ray to hit a triangle in a position outside the bounds of the cell + // if it's large enough to cover multiple ones. The hit must be ignored if this is the case. + continue; + } + + if (!backface) { + // The case of meshes having both a front and back face in the same plane is more common than + // expected, so if this is a front-face, bias it closer to the ray origin, so it always wins + // over the back-face. + distance = max(bake_params.bias, distance - bake_params.bias); + } + + if (distance < best_distance) { + hit = backface ? RAY_BACK : RAY_FRONT; + best_distance = distance; + r_distance = distance; + r_normal = normal; + r_triangle = triangle_index; + r_barycentric = barycentric; + } + } + } + +#ifdef CLUSTER_TRIANGLE_ITERATION + triangle_base_index += CLUSTER_SIZE; +#endif + } + } + + cluster_base_index += 32; } if (hit != RAY_MISS) {