// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../builders/primrefgen.h" #include "../builders/heuristic_spatial.h" #include "../builders/splitter.h" #include "../../common/algorithms/parallel_for_for.h" #include "../../common/algorithms/parallel_for_for_prefix_sum.h" #define DBG_PRESPLIT(x) #define CHECK_PRESPLIT(x) #define GRID_SIZE 1024 #define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 #define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) #define PRIORITY_CUTOFF_THRESHOLD 1.0f #define PRIORITY_SPLIT_POS_WEIGHT 1.5f namespace embree { namespace isa { struct PresplitItem { union { float priority; unsigned int data; }; unsigned int index; __forceinline operator unsigned() const { return reinterpret_cast<const unsigned&>(priority); } __forceinline bool operator < (const PresplitItem& item) const { return (priority < item.priority); } template<typename Mesh> __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) { const unsigned int geomID = ref.geomID(); const unsigned int primID = ref.primID(); const float area_aabb = area(ref.bounds()); const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); const unsigned int diff = 31 - lzcnt(mc.x^mc.y); assert(area_prim <= area_aabb); //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); assert(priority >= 0.0f && priority < FLT_LARGE); return priority; } }; inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { return cout << "index " << item.index << " priority " << item.priority; }; template<typename SplitterFactory> void splitPrimitive(SplitterFactory &Splitter, const PrimRef &prim, const unsigned int geomID, const unsigned int primID, const unsigned int split_level, const Vec3fa &grid_base, const float grid_scale, const float grid_extend, PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], unsigned int& numSubPrims) { assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); if (split_level == 0) { assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); subPrims[numSubPrims++] = prim; } else { const Vec3fa lower = prim.lower; const Vec3fa upper = prim.upper; const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); Vec3ia ilower(floor(glower)); Vec3ia iupper(floor(gupper)); /* this ignores dimensions that are empty */ iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); /* compute a morton code for the lower and upper grid coordinates. */ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); /* if all bits are equal then we cannot split */ if(unlikely(lower_code == upper_code)) { assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); subPrims[numSubPrims++] = prim; return; } /* compute octree level and dimension to perform the split in */ const unsigned int diff = 31 - lzcnt(lower_code^upper_code); const unsigned int level = diff / 3; const unsigned int dim = diff % 3; /* now we compute the grid position of the split */ const unsigned int isplit = iupper[dim] & ~((1<<level)-1); /* compute world space position of split */ const float inv_grid_size = 1.0f / GRID_SIZE; const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; assert(prim.lower[dim] <= fsplit && prim.upper[dim] >= fsplit); /* split primitive */ const auto splitter = Splitter(prim); BBox3fa left,right; splitter(prim.bounds(),dim,fsplit,left,right); assert(!left.empty()); assert(!right.empty()); splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); } } template<typename Mesh, typename SplitterFactory> PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) { ParallelPrefixSumState<PrimInfo> pstate; /* first try */ progressMonitor(0); PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { return geometry->createPrimRefArray(prims,r,r.begin(),geomID); }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); /* if we need to filter out geometry, run again */ if (pinfo.size() != numPrimRefs) { progressMonitor(0); pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { return geometry->createPrimRefArray(prims,r,base.size(),geomID); }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); } return pinfo; } __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) { const Vec3fa lower = ref.lower; const Vec3fa upper = ref.upper; const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); Vec3ia ilower(floor(glower)); Vec3ia iupper(floor(gupper)); /* this ignores dimensions that are empty */ iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); /* compute a morton code for the lower and upper grid coordinates. */ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); return Vec2i(lower_code,upper_code); } template<typename Mesh, typename SplitterFactory> PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) { static const size_t MIN_STEP_SIZE = 128; ParallelForForPrefixSumState<PrimInfo> pstate; Scene::Iterator2 iter(scene,types,mblur); /* first try */ progressMonitor(0); pstate.init(iter,size_t(1024)); PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); /* if we need to filter out geometry, run again */ if (pinfo.size() != numPrimRefs) { progressMonitor(0); pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); } /* use correct number of primitives */ size_t numPrimitives = pinfo.size(); const size_t alloc_numPrimitives = prims.size(); const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; /* set up primitive splitter */ SplitterFactory Splitter(scene); DBG_PRESPLIT( const size_t org_numPrimitives = pinfo.size(); PRINT(numPrimitives); PRINT(alloc_numPrimitives); PRINT(numSplitPrimitivesBudget); ); /* allocate double buffer presplit items */ const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); /* compute grid */ const Vec3fa grid_base = pinfo.geomBounds.lower; const Vec3fa grid_diag = pinfo.geomBounds.size(); const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; /* init presplit items and get total sum */ const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { float sum = 0.0f; for (size_t i=r.begin(); i<r.end(); i++) { presplitItem[i].index = (unsigned int)i; const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]); /* if all bits are equal then we cannot split */ presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; /* FIXME: sum undeterministic */ sum += presplitItem[i].priority; } return sum; },[](const float& a, const float& b) -> float { return a+b; }); /* compute number of splits per primitive */ const float inv_psum = 1.0f / psum; parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { for (size_t i=r.begin(); i<r.end(); i++) { if (presplitItem[i].priority > 0.0f) { const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims { presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); } else presplitItem[i].priority = 0.0f; } } }); auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); /* anything to split ? */ if (center < numPrimitives) { size_t numPrimitivesToSplit = numPrimitives - center; assert(presplitItem[center].priority >= 1.0f); /* sort presplit items in ascending order */ radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); CHECK_PRESPLIT( parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { for (size_t i=r.begin(); i<r.end(); i++) assert(presplitItem[i-1].priority <= presplitItem[i].priority); }); ); unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem; unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { size_t sum = 0; for (size_t i=t.begin(); i<t.end(); i++) { PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; assert(presplitItem[i].priority >= 1.0f); const unsigned int primrefID = presplitItem[i].index; const float prio = presplitItem[i].priority; const unsigned int geomID = prims[primrefID].geomID(); const unsigned int primID = prims[primrefID].primID(); const unsigned int split_levels = (unsigned int)prio; unsigned int numSubPrims = 0; splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); assert(numSubPrims); numSubPrims--; // can reuse slot sum+=numSubPrims; presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; primOffset0[i-center] = numSubPrims; } return sum; },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); /* if we are over budget, need to shrink the range */ if (totalNumSubPrims > numSplitPrimitivesBudget) { size_t new_center = numPrimitives-1; size_t sum = 0; for (;new_center>=center;new_center--) { const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; sum += numSubPrims; } new_center++; primOffset0 += new_center - center; numPrimitivesToSplit -= new_center - center; center = new_center; assert(numPrimitivesToSplit == (numPrimitives - center)); } /* parallel prefix sum to compute offsets for storing sub-primitives */ const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); assert(numPrimitives+offset <= alloc_numPrimitives); /* iterate over range, and split primitives into sub primitives and append them to prims array */ parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { for (size_t j=rn.begin(); j<rn.end(); j++) { PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; const unsigned int primrefID = presplitItem[j].index; const unsigned int geomID = prims[primrefID].geomID(); const unsigned int primID = prims[primrefID].primID(); const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); assert(split_levels); assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); unsigned int numSubPrims = 0; splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); const size_t newID = numPrimitives + primOffset1[j-center]; assert(newID+numSubPrims-1 <= alloc_numPrimitives); prims[primrefID] = subPrims[0]; for (size_t i=1;i<numSubPrims;i++) prims[newID+i-1] = subPrims[i]; } }); numPrimitives += offset; DBG_PRESPLIT( PRINT(pinfo.size()); PRINT(numPrimitives); PRINT((float)numPrimitives/org_numPrimitives)); } /* recompute centroid bounding boxes */ pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { PrimInfo p(empty); for (size_t j=r.begin(); j<r.end(); j++) p.add_center2(prims[j]); return p; }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); assert(pinfo.size() == numPrimitives); /* free double buffer presplit items */ alignedFree(tmp_presplitItem); alignedFree(presplitItem); return pinfo; } } }