parent
39b1c06a59
commit
f694ab1c64
|
@ -1,5 +1,7 @@
|
|||
import glob, os, shutil, subprocess, re
|
||||
|
||||
git_tag = "v3.13.5"
|
||||
|
||||
include_dirs = [
|
||||
"common/tasking",
|
||||
"kernels/bvh",
|
||||
|
@ -12,6 +14,7 @@ include_dirs = [
|
|||
"common/lexers",
|
||||
"common/simd",
|
||||
"common/simd/arm",
|
||||
"common/simd/wasm",
|
||||
"include/embree3",
|
||||
"kernels/subdiv",
|
||||
"kernels/geometry",
|
||||
|
@ -76,6 +79,7 @@ if os.path.exists(dir_name):
|
|||
|
||||
subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"])
|
||||
os.chdir("embree-tmp")
|
||||
subprocess.run(["git", "checkout", git_tag])
|
||||
|
||||
commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
|
||||
|
||||
|
@ -94,8 +98,7 @@ for f in all_files:
|
|||
|
||||
with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
|
||||
hash_file.write(
|
||||
f"""
|
||||
// Copyright 2009-2020 Intel Corporation
|
||||
f"""// Copyright 2009-2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
#define RTC_HASH "{commit_hash}"
|
||||
|
@ -104,8 +107,7 @@ with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
|
|||
|
||||
with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
|
||||
config_file.write(
|
||||
"""
|
||||
// Copyright 2009-2020 Intel Corporation
|
||||
"""// Copyright 2009-2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/* #undef EMBREE_RAY_MASK */
|
||||
|
@ -126,6 +128,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
|
|||
/* #undef EMBREE_COMPACT_POLYS */
|
||||
|
||||
#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
|
||||
#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
|
||||
|
||||
#if defined(EMBREE_GEOMETRY_TRIANGLE)
|
||||
#define IF_ENABLED_TRIS(x) x
|
||||
|
@ -192,8 +195,7 @@ with open("CMakeLists.txt", "r") as cmake_file:
|
|||
|
||||
with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file:
|
||||
config_file.write(
|
||||
f"""
|
||||
// Copyright 2009-2021 Intel Corporation
|
||||
f"""// Copyright 2009-2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
#pragma once
|
||||
|
@ -209,14 +211,16 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con
|
|||
#define EMBREE_MIN_WIDTH 0
|
||||
#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
|
||||
|
||||
#define EMBREE_STATIC_LIB
|
||||
/* #undef EMBREE_API_NAMESPACE */
|
||||
#if !defined(EMBREE_STATIC_LIB)
|
||||
# define EMBREE_STATIC_LIB
|
||||
#endif
|
||||
/* #undef EMBREE_API_NAMESPACE*/
|
||||
|
||||
#if defined(EMBREE_API_NAMESPACE)
|
||||
# define RTC_NAMESPACE
|
||||
# define RTC_NAMESPACE_BEGIN namespace {{
|
||||
# define RTC_NAMESPACE_BEGIN namespace {{
|
||||
# define RTC_NAMESPACE_END }}
|
||||
# define RTC_NAMESPACE_USE using namespace ;
|
||||
# define RTC_NAMESPACE_USE using namespace;
|
||||
# define RTC_API_EXTERN_C
|
||||
# undef EMBREE_API_NAMESPACE
|
||||
#else
|
||||
|
|
|
@ -53,7 +53,7 @@ Files extracted from upstream source:
|
|||
## embree
|
||||
|
||||
- Upstream: https://github.com/embree/embree
|
||||
- Version: 3.13.1 (12b99393438a4cc9e478e33459eed78bec6233fd, 2021)
|
||||
- Version: 3.13.5 (698442324ccddd11725fb8875275dc1384f7fb40, 2022)
|
||||
- License: Apache 2.0
|
||||
|
||||
Files extracted from upstream:
|
||||
|
|
|
@ -26,7 +26,6 @@ namespace embree
|
|||
abort();
|
||||
// -- GODOT end --
|
||||
}
|
||||
|
||||
#elif defined(TASKING_TBB)
|
||||
#if TBB_INTERFACE_VERSION >= 12002
|
||||
tbb::task_group_context context;
|
||||
|
|
|
@ -30,15 +30,20 @@ namespace embree
|
|||
template<typename ArrayArray>
|
||||
__forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
|
||||
init(array2,minStepSize);
|
||||
}
|
||||
|
||||
template<typename SizeFunc>
|
||||
__forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
|
||||
init(numArrays,getSize,minStepSize);
|
||||
}
|
||||
|
||||
template<typename ArrayArray>
|
||||
__forceinline void init ( ArrayArray& array2, const size_t minStepSize )
|
||||
template<typename SizeFunc>
|
||||
__forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
|
||||
{
|
||||
/* first calculate total number of elements */
|
||||
size_t N = 0;
|
||||
for (size_t i=0; i<array2.size(); i++) {
|
||||
N += array2[i] ? array2[i]->size() : 0;
|
||||
for (size_t i=0; i<numArrays; i++) {
|
||||
N += getSize(i);
|
||||
}
|
||||
this->N = N;
|
||||
|
||||
|
@ -54,8 +59,8 @@ namespace embree
|
|||
size_t k0 = (++taskIndex)*N/taskCount;
|
||||
for (size_t i=0, k=0; taskIndex < taskCount; i++)
|
||||
{
|
||||
assert(i<array2.size());
|
||||
size_t j=0, M = array2[i] ? array2[i]->size() : 0;
|
||||
assert(i<numArrays);
|
||||
size_t j=0, M = getSize(i);
|
||||
while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
|
||||
assert(taskIndex<taskCount);
|
||||
i0[taskIndex] = i;
|
||||
|
@ -67,6 +72,12 @@ namespace embree
|
|||
}
|
||||
}
|
||||
|
||||
template<typename ArrayArray>
|
||||
__forceinline void init ( ArrayArray& array2, const size_t minStepSize )
|
||||
{
|
||||
init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize);
|
||||
}
|
||||
|
||||
__forceinline size_t size() const {
|
||||
return N;
|
||||
}
|
||||
|
|
|
@ -17,12 +17,56 @@ namespace embree
|
|||
__forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
|
||||
: ParallelForForState(array2,minStepSize) {}
|
||||
|
||||
template<typename SizeFunc>
|
||||
__forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize)
|
||||
: ParallelForForState(numArrays,getSize,minStepSize) {}
|
||||
|
||||
ParallelPrefixSumState<Value> prefix_state;
|
||||
};
|
||||
|
||||
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
|
||||
const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction)
|
||||
{
|
||||
/* calculate number of tasks to use */
|
||||
const size_t taskCount = state.taskCount;
|
||||
|
||||
/* perform parallel prefix sum */
|
||||
parallel_for(taskCount, [&](const size_t taskIndex)
|
||||
{
|
||||
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
|
||||
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
|
||||
size_t i0 = state.i0[taskIndex];
|
||||
size_t j0 = state.j0[taskIndex];
|
||||
|
||||
/* iterate over arrays */
|
||||
size_t k=k0;
|
||||
Value N=identity;
|
||||
for (size_t i=i0; k<k1; i++) {
|
||||
const size_t size = getSize(i);
|
||||
const size_t r0 = j0, r1 = min(size,r0+k1-k);
|
||||
if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k));
|
||||
k+=r1-r0; j0 = 0;
|
||||
}
|
||||
state.prefix_state.counts[taskIndex] = N;
|
||||
});
|
||||
|
||||
/* calculate prefix sum */
|
||||
Value sum=identity;
|
||||
for (size_t i=0; i<taskCount; i++)
|
||||
{
|
||||
const Value c = state.prefix_state.counts[i];
|
||||
state.prefix_state.sums[i] = sum;
|
||||
sum=reduction(sum,c);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
|
||||
const SizeFunc& getSize,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
{
|
||||
/* calculate number of tasks to use */
|
||||
const size_t taskCount = state.taskCount;
|
||||
|
@ -38,9 +82,9 @@ namespace embree
|
|||
size_t k=k0;
|
||||
Value N=identity;
|
||||
for (size_t i=i0; k<k1; i++) {
|
||||
const size_t size = array2[i] ? array2[i]->size() : 0;
|
||||
const size_t size = getSize(i);
|
||||
const size_t r0 = j0, r1 = min(size,r0+k1-k);
|
||||
if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
|
||||
if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N)));
|
||||
k+=r1-r0; j0 = 0;
|
||||
}
|
||||
state.prefix_state.counts[taskIndex] = N;
|
||||
|
@ -59,43 +103,29 @@ namespace embree
|
|||
}
|
||||
|
||||
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state,
|
||||
ArrayArray& array2, Index minStepSize,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
{
|
||||
/* calculate number of tasks to use */
|
||||
const size_t taskCount = state.taskCount;
|
||||
/* perform parallel prefix sum */
|
||||
parallel_for(taskCount, [&](const size_t taskIndex)
|
||||
{
|
||||
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
|
||||
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
|
||||
size_t i0 = state.i0[taskIndex];
|
||||
size_t j0 = state.j0[taskIndex];
|
||||
|
||||
/* iterate over arrays */
|
||||
size_t k=k0;
|
||||
Value N=identity;
|
||||
for (size_t i=i0; k<k1; i++) {
|
||||
const size_t size = array2[i] ? array2[i]->size() : 0;
|
||||
const size_t r0 = j0, r1 = min(size,r0+k1-k);
|
||||
if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
|
||||
k+=r1-r0; j0 = 0;
|
||||
}
|
||||
state.prefix_state.counts[taskIndex] = N;
|
||||
});
|
||||
|
||||
/* calculate prefix sum */
|
||||
Value sum=identity;
|
||||
for (size_t i=0; i<taskCount; i++)
|
||||
{
|
||||
const Value c = state.prefix_state.counts[i];
|
||||
state.prefix_state.sums[i] = sum;
|
||||
sum=reduction(sum,c);
|
||||
}
|
||||
|
||||
return sum;
|
||||
return parallel_for_for_prefix_sum0_(state,minStepSize,
|
||||
[&](Index i) { return array2[i] ? array2[i]->size() : 0; },
|
||||
identity,
|
||||
[&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); },
|
||||
reduction);
|
||||
}
|
||||
|
||||
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state,
|
||||
ArrayArray& array2, Index minStepSize,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
{
|
||||
return parallel_for_for_prefix_sum1_(state,minStepSize,
|
||||
[&](Index i) { return array2[i] ? array2[i]->size() : 0; },
|
||||
identity,
|
||||
[&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); },
|
||||
reduction);
|
||||
}
|
||||
|
||||
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
|
||||
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
|
||||
const Value& identity, const Func& func, const Reduction& reduction)
|
||||
|
|
|
@ -26,7 +26,7 @@ namespace embree
|
|||
const Index threadCount = (Index) TaskScheduler::threadCount();
|
||||
taskCount = min(taskCount,threadCount,maxTasks);
|
||||
|
||||
/* parallel invokation of all tasks */
|
||||
/* parallel invocation of all tasks */
|
||||
dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
|
||||
parallel_for(taskCount, [&](const Index taskIndex) {
|
||||
const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
|
||||
|
|
|
@ -77,7 +77,7 @@ namespace embree
|
|||
return lower > upper;
|
||||
}
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
template<> __forceinline bool BBox<Vec3fa>::empty() const {
|
||||
return !all(le_mask(lower,upper));
|
||||
}
|
||||
|
@ -196,11 +196,11 @@ namespace embree
|
|||
}
|
||||
|
||||
template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
|
||||
return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
|
||||
return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
|
||||
}
|
||||
|
||||
template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
|
||||
return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
|
||||
return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
|
||||
}
|
||||
|
||||
/*! blending */
|
||||
|
@ -228,11 +228,11 @@ namespace embree
|
|||
/// SSE / AVX / MIC specializations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined __SSE__
|
||||
#if defined (__SSE__) || defined(__ARM_NEON)
|
||||
#include "../simd/sse.h"
|
||||
#endif
|
||||
|
||||
#if defined __AVX__
|
||||
#if defined (__AVX__)
|
||||
#include "../simd/avx.h"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -152,21 +152,38 @@ namespace embree
|
|||
}
|
||||
__forceinline const Color rcp ( const Color& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
__m128 reciprocal = _mm_rcp_ps(a.m128);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
|
||||
return (const Color)reciprocal;
|
||||
#else
|
||||
#if defined(__AVX512VL__)
|
||||
const Color r = _mm_rcp14_ps(a.m128);
|
||||
#else
|
||||
const Color r = _mm_rcp_ps(a.m128);
|
||||
#endif
|
||||
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
|
||||
return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)
|
||||
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
__forceinline const Color rsqrt( const Color& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
return r;
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
__m128 r = _mm_rsqrt14_ps(a.m128);
|
||||
#else
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
#endif
|
||||
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
|
||||
|
||||
|
|
|
@ -5,23 +5,4 @@
|
|||
|
||||
namespace embree
|
||||
{
|
||||
TrueTy True;
|
||||
FalseTy False;
|
||||
ZeroTy zero;
|
||||
OneTy one;
|
||||
NegInfTy neg_inf;
|
||||
PosInfTy inf;
|
||||
PosInfTy pos_inf;
|
||||
NaNTy nan;
|
||||
UlpTy ulp;
|
||||
PiTy pi;
|
||||
OneOverPiTy one_over_pi;
|
||||
TwoPiTy two_pi;
|
||||
OneOverTwoPiTy one_over_two_pi;
|
||||
FourPiTy four_pi;
|
||||
OneOverFourPiTy one_over_four_pi;
|
||||
StepTy step;
|
||||
ReverseStepTy reverse_step;
|
||||
EmptyTy empty;
|
||||
UndefinedTy undefined;
|
||||
}
|
||||
|
|
|
@ -24,13 +24,13 @@ namespace embree
|
|||
__forceinline operator bool( ) const { return true; }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED TrueTy True;
|
||||
const constexpr TrueTy True = TrueTy();
|
||||
|
||||
struct FalseTy {
|
||||
__forceinline operator bool( ) const { return false; }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED FalseTy False;
|
||||
const constexpr FalseTy False = FalseTy();
|
||||
|
||||
struct ZeroTy
|
||||
{
|
||||
|
@ -48,7 +48,7 @@ namespace embree
|
|||
__forceinline operator unsigned char ( ) const { return 0; }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED ZeroTy zero;
|
||||
const constexpr ZeroTy zero = ZeroTy();
|
||||
|
||||
struct OneTy
|
||||
{
|
||||
|
@ -66,7 +66,7 @@ namespace embree
|
|||
__forceinline operator unsigned char ( ) const { return 1; }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED OneTy one;
|
||||
const constexpr OneTy one = OneTy();
|
||||
|
||||
struct NegInfTy
|
||||
{
|
||||
|
@ -85,7 +85,7 @@ namespace embree
|
|||
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED NegInfTy neg_inf;
|
||||
const constexpr NegInfTy neg_inf = NegInfTy();
|
||||
|
||||
struct PosInfTy
|
||||
{
|
||||
|
@ -103,8 +103,8 @@ namespace embree
|
|||
__forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED PosInfTy inf;
|
||||
extern MAYBE_UNUSED PosInfTy pos_inf;
|
||||
const constexpr PosInfTy inf = PosInfTy();
|
||||
const constexpr PosInfTy pos_inf = PosInfTy();
|
||||
|
||||
struct NaNTy
|
||||
{
|
||||
|
@ -112,15 +112,15 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED NaNTy nan;
|
||||
const constexpr NaNTy nan = NaNTy();
|
||||
|
||||
struct UlpTy
|
||||
{
|
||||
__forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
|
||||
__forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED UlpTy ulp;
|
||||
|
||||
const constexpr UlpTy ulp = UlpTy();
|
||||
|
||||
struct PiTy
|
||||
{
|
||||
|
@ -128,7 +128,7 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(M_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED PiTy pi;
|
||||
const constexpr PiTy pi = PiTy();
|
||||
|
||||
struct OneOverPiTy
|
||||
{
|
||||
|
@ -136,7 +136,7 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(M_1_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED OneOverPiTy one_over_pi;
|
||||
const constexpr OneOverPiTy one_over_pi = OneOverPiTy();
|
||||
|
||||
struct TwoPiTy
|
||||
{
|
||||
|
@ -144,7 +144,7 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(2.0*M_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED TwoPiTy two_pi;
|
||||
const constexpr TwoPiTy two_pi = TwoPiTy();
|
||||
|
||||
struct OneOverTwoPiTy
|
||||
{
|
||||
|
@ -152,7 +152,7 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(0.5*M_1_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
|
||||
const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy();
|
||||
|
||||
struct FourPiTy
|
||||
{
|
||||
|
@ -160,7 +160,7 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(4.0*M_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED FourPiTy four_pi;
|
||||
const constexpr FourPiTy four_pi = FourPiTy();
|
||||
|
||||
struct OneOverFourPiTy
|
||||
{
|
||||
|
@ -168,30 +168,42 @@ namespace embree
|
|||
__forceinline operator float ( ) const { return float(0.25*M_1_PI); }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
|
||||
const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy();
|
||||
|
||||
struct StepTy {
|
||||
__forceinline operator double ( ) const { return 0; }
|
||||
__forceinline operator float ( ) const { return 0; }
|
||||
__forceinline operator long long( ) const { return 0; }
|
||||
__forceinline operator unsigned long long( ) const { return 0; }
|
||||
__forceinline operator long ( ) const { return 0; }
|
||||
__forceinline operator unsigned long ( ) const { return 0; }
|
||||
__forceinline operator int ( ) const { return 0; }
|
||||
__forceinline operator unsigned int ( ) const { return 0; }
|
||||
__forceinline operator short ( ) const { return 0; }
|
||||
__forceinline operator unsigned short ( ) const { return 0; }
|
||||
__forceinline operator char ( ) const { return 0; }
|
||||
__forceinline operator unsigned char ( ) const { return 0; }
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED StepTy step;
|
||||
const constexpr StepTy step = StepTy();
|
||||
|
||||
struct ReverseStepTy {
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED ReverseStepTy reverse_step;
|
||||
const constexpr ReverseStepTy reverse_step = ReverseStepTy();
|
||||
|
||||
struct EmptyTy {
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED EmptyTy empty;
|
||||
const constexpr EmptyTy empty = EmptyTy();
|
||||
|
||||
struct FullTy {
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED FullTy full;
|
||||
const constexpr FullTy full = FullTy();
|
||||
|
||||
struct UndefinedTy {
|
||||
};
|
||||
|
||||
extern MAYBE_UNUSED UndefinedTy undefined;
|
||||
const constexpr UndefinedTy undefined = UndefinedTy();
|
||||
}
|
||||
|
|
|
@ -53,6 +53,16 @@ namespace embree
|
|||
|
||||
__forceinline float rcp ( const float x )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
// Move scalar to vector register and do rcp.
|
||||
__m128 a;
|
||||
a[0] = x;
|
||||
float32x4_t reciprocal = vrecpeq_f32(a);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
return reciprocal[0];
|
||||
#else
|
||||
|
||||
const __m128 a = _mm_set_ss(x);
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
@ -66,30 +76,71 @@ namespace embree
|
|||
#else
|
||||
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
|
||||
#endif
|
||||
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
|
||||
__forceinline float signmsk ( const float x ) {
|
||||
#if defined(__aarch64__)
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 a;
|
||||
__m128i b;
|
||||
a[0] = x;
|
||||
b[0] = 0x80000000;
|
||||
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
|
||||
return a[0];
|
||||
#else
|
||||
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
|
||||
#endif
|
||||
}
|
||||
__forceinline float xorf( const float x, const float y ) {
|
||||
#if defined(__aarch64__)
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 a;
|
||||
__m128 b;
|
||||
a[0] = x;
|
||||
b[0] = y;
|
||||
a = _mm_xor_ps(a, b);
|
||||
return a[0];
|
||||
#else
|
||||
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
|
||||
#endif
|
||||
}
|
||||
__forceinline float andf( const float x, const unsigned y ) {
|
||||
#if defined(__aarch64__)
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 a;
|
||||
__m128i b;
|
||||
a[0] = x;
|
||||
b[0] = y;
|
||||
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
|
||||
return a[0];
|
||||
#else
|
||||
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
|
||||
#endif
|
||||
}
|
||||
__forceinline float rsqrt( const float x )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 a;
|
||||
a[0] = x;
|
||||
__m128 value = _mm_rsqrt_ps(a);
|
||||
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
|
||||
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
|
||||
return value[0];
|
||||
#else
|
||||
|
||||
const __m128 a = _mm_set_ss(x);
|
||||
#if defined(__AVX512VL__)
|
||||
__m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
|
||||
#else
|
||||
__m128 r = _mm_rsqrt_ss(a);
|
||||
#endif
|
||||
r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
|
||||
#if defined(__ARM_NEON)
|
||||
r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
|
||||
const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
|
||||
_mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
|
||||
return _mm_cvtss_f32(c);
|
||||
#endif
|
||||
return _mm_cvtss_f32(r);
|
||||
}
|
||||
|
||||
#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
|
||||
|
@ -146,7 +197,17 @@ namespace embree
|
|||
__forceinline double floor( const double x ) { return ::floor (x); }
|
||||
__forceinline double ceil ( const double x ) { return ::ceil (x); }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
__forceinline float mini(float a, float b) {
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 x;
|
||||
__m128 y;
|
||||
x[0] = a;
|
||||
y[0] = b;
|
||||
x = _mm_min_ps(x, y);
|
||||
return x[0];
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
__forceinline float mini(float a, float b) {
|
||||
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
|
||||
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
|
||||
|
@ -155,7 +216,17 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
__forceinline float maxi(float a, float b) {
|
||||
// FP and Neon shares same vector register in arm64
|
||||
__m128 x;
|
||||
__m128 y;
|
||||
x[0] = a;
|
||||
y[0] = b;
|
||||
x = _mm_max_ps(x, y);
|
||||
return x[0];
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
__forceinline float maxi(float a, float b) {
|
||||
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
|
||||
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
|
||||
|
@ -172,9 +243,12 @@ namespace embree
|
|||
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
|
||||
__forceinline float min(float a, float b) { return a<b ? a:b; }
|
||||
__forceinline double min(double a, double b) { return a<b ? a:b; }
|
||||
#if defined(__64BIT__)
|
||||
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
|
||||
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
|
||||
#endif
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
__forceinline long min(long a, long b) { return a<b ? a:b; }
|
||||
#endif
|
||||
|
||||
template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
|
||||
template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
|
||||
|
@ -189,9 +263,12 @@ namespace embree
|
|||
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
|
||||
__forceinline float max(float a, float b) { return a<b ? b:a; }
|
||||
__forceinline double max(double a, double b) { return a<b ? b:a; }
|
||||
#if defined(__64BIT__)
|
||||
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
|
||||
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
|
||||
#endif
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
__forceinline long max(long a, long b) { return a<b ? b:a; }
|
||||
#endif
|
||||
|
||||
template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
|
||||
template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
|
||||
|
@ -231,6 +308,15 @@ namespace embree
|
|||
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
|
||||
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
|
||||
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
|
||||
|
||||
#elif defined (__aarch64__) && defined(__clang__)
|
||||
#pragma clang fp contract(fast)
|
||||
__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
|
||||
__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
|
||||
__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
|
||||
__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
|
||||
#pragma clang fp contract(on)
|
||||
|
||||
#else
|
||||
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
|
||||
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
|
||||
|
@ -326,7 +412,7 @@ namespace embree
|
|||
return x | (y << 1) | (z << 2);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
|
||||
template<>
|
||||
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
|
||||
|
|
|
@ -242,13 +242,17 @@ namespace embree
|
|||
T cosTheta = dot(q0, q1_);
|
||||
QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
|
||||
cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta);
|
||||
if (unlikely(all(cosTheta > 0.9995f))) {
|
||||
return normalize(lerp(q0, q1, t));
|
||||
}
|
||||
|
||||
// spherical linear interpolation
|
||||
const T phi = t * fastapprox::acos(cosTheta);
|
||||
T sinPhi, cosPhi;
|
||||
fastapprox::sincos(phi, sinPhi, cosPhi);
|
||||
QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
|
||||
return msub(cosPhi, q0, qperp);
|
||||
QuaternionT<T> qslerp = msub(cosPhi, q0, qperp);
|
||||
|
||||
// regular linear interpolation as fallback
|
||||
QuaternionT<T> qlerp = normalize(lerp(q0, q1, t));
|
||||
|
||||
return select(cosTheta > 0.9995f, qlerp, qslerp);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ __forceinline T sin(const T &v)
|
|||
// Reduced range version of x
|
||||
auto x = v - kReal * piOverTwoVec;
|
||||
auto kMod4 = k & 3;
|
||||
auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
|
||||
auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);
|
||||
auto flipSign = (kMod4 > 1);
|
||||
|
||||
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
|
||||
|
@ -76,8 +76,8 @@ __forceinline T cos(const T &v)
|
|||
auto x = v - kReal * piOverTwoVec;
|
||||
|
||||
auto kMod4 = k & 3;
|
||||
auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
|
||||
auto flipSign = (kMod4 == 1 | kMod4 == 2);
|
||||
auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);
|
||||
auto flipSign = (kMod4 == 1) | (kMod4 == 2);
|
||||
|
||||
const float sinC2 = -0.16666667163372039794921875;
|
||||
const float sinC4 = +8.333347737789154052734375e-3;
|
||||
|
|
|
@ -144,7 +144,7 @@ namespace embree
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
|
||||
|
@ -205,11 +205,11 @@ namespace embree
|
|||
|
||||
#include "vec2fa.h"
|
||||
|
||||
#if defined __SSE__
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
#include "../simd/sse.h"
|
||||
#endif
|
||||
|
||||
#if defined __AVX__
|
||||
#if defined(__AVX__)
|
||||
#include "../simd/avx.h"
|
||||
#endif
|
||||
|
||||
|
@ -221,7 +221,7 @@ namespace embree
|
|||
{
|
||||
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -97,6 +97,12 @@ namespace embree
|
|||
|
||||
__forceinline Vec2fa rcp ( const Vec2fa& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
__m128 reciprocal = _mm_rcp_ps(a.m128);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
|
||||
return (const Vec2fa)reciprocal;
|
||||
#else
|
||||
#if defined(__AVX512VL__)
|
||||
const Vec2fa r = _mm_rcp14_ps(a.m128);
|
||||
#else
|
||||
|
@ -104,13 +110,15 @@ namespace embree
|
|||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
|
||||
const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
|
||||
const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n
|
||||
#else
|
||||
const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
|
||||
//return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
|
||||
const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0)
|
||||
const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n
|
||||
#endif
|
||||
|
||||
return res;
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
|
||||
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
|
||||
|
@ -118,12 +126,21 @@ namespace embree
|
|||
|
||||
__forceinline Vec2fa rsqrt( const Vec2fa& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
return r;
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
__m128 r = _mm_rsqrt14_ps(a.m128);
|
||||
#else
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
#endif
|
||||
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
|
||||
|
@ -156,7 +173,7 @@ namespace embree
|
|||
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
|
||||
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
|
||||
const vint4 ai = _mm_castps_si128(a);
|
||||
const vint4 bi = _mm_castps_si128(b);
|
||||
|
@ -165,7 +182,7 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
|
||||
const vint4 ai = _mm_castps_si128(a);
|
||||
const vint4 bi = _mm_castps_si128(b);
|
||||
|
@ -227,7 +244,7 @@ namespace embree
|
|||
__forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
|
|
@ -197,7 +197,7 @@ namespace embree
|
|||
template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); }
|
||||
|
@ -207,7 +207,6 @@ namespace embree
|
|||
template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
|
||||
template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
|
||||
template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
|
||||
|
||||
template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
|
||||
{
|
||||
const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
|
||||
|
@ -266,11 +265,11 @@ namespace embree
|
|||
/// SSE / AVX / MIC specializations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined __SSE__
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
#include "../simd/sse.h"
|
||||
#endif
|
||||
|
||||
#if defined __AVX__
|
||||
#if defined(__AVX__)
|
||||
#include "../simd/avx.h"
|
||||
#endif
|
||||
|
||||
|
@ -291,14 +290,14 @@ namespace embree
|
|||
template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
|
||||
x = a.x; y = a.y; z = a.z;
|
||||
}
|
||||
#elif defined(__SSE__)
|
||||
#elif defined(__SSE__) || defined(__ARM_NEON)
|
||||
template<>
|
||||
__forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
|
||||
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
template<>
|
||||
__forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
|
||||
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
|
||||
|
|
|
@ -55,7 +55,13 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static __forceinline Vec3fa load( const void* const a ) {
|
||||
#if defined(__aarch64__)
|
||||
__m128 t = _mm_load_ps((float*)a);
|
||||
t[3] = 0.0f;
|
||||
return Vec3fa(t);
|
||||
#else
|
||||
return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
|
||||
#endif
|
||||
}
|
||||
|
||||
static __forceinline Vec3fa loadu( const void* const a ) {
|
||||
|
@ -89,12 +95,20 @@ namespace embree
|
|||
|
||||
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
|
||||
__forceinline Vec3fa operator -( const Vec3fa& a ) {
|
||||
#if defined(__aarch64__)
|
||||
return vnegq_f32(a.m128);
|
||||
#else
|
||||
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
|
||||
return _mm_xor_ps(a.m128, mask);
|
||||
#endif
|
||||
}
|
||||
__forceinline Vec3fa abs ( const Vec3fa& a ) {
|
||||
#if defined(__aarch64__)
|
||||
return _mm_abs_ps(a.m128);
|
||||
#else
|
||||
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
|
||||
return _mm_and_ps(a.m128, mask);
|
||||
#endif
|
||||
}
|
||||
__forceinline Vec3fa sign ( const Vec3fa& a ) {
|
||||
return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
|
||||
|
@ -102,6 +116,10 @@ namespace embree
|
|||
|
||||
__forceinline Vec3fa rcp ( const Vec3fa& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
const Vec3fa r = _mm_rcp14_ps(a.m128);
|
||||
#else
|
||||
|
@ -109,13 +127,15 @@ namespace embree
|
|||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
|
||||
const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
|
||||
const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
|
||||
#else
|
||||
const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
|
||||
//return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
|
||||
const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
|
||||
const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n
|
||||
#endif
|
||||
|
||||
return res;
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
|
||||
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
|
||||
|
@ -123,12 +143,20 @@ namespace embree
|
|||
|
||||
__forceinline Vec3fa rsqrt( const Vec3fa& a )
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
|
||||
return r;
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
__m128 r = _mm_rsqrt14_ps(a.m128);
|
||||
#else
|
||||
__m128 r = _mm_rsqrt_ps(a.m128);
|
||||
#endif
|
||||
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
|
||||
|
@ -161,7 +189,7 @@ namespace embree
|
|||
__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
|
||||
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
|
||||
const vint4 ai = _mm_castps_si128(a.m128);
|
||||
const vint4 bi = _mm_castps_si128(b.m128);
|
||||
|
@ -170,7 +198,7 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
|
||||
const vint4 ai = _mm_castps_si128(a.m128);
|
||||
const vint4 bi = _mm_castps_si128(b.m128);
|
||||
|
@ -187,16 +215,16 @@ namespace embree
|
|||
/// Ternary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
|
||||
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
|
||||
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
|
||||
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
|
||||
#else
|
||||
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
|
||||
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
|
||||
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
|
||||
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
|
||||
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
|
||||
#endif
|
||||
|
||||
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
|
||||
|
@ -218,8 +246,26 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#if defined(__aarch64__)
|
||||
__forceinline float reduce_add(const Vec3fa& v) {
|
||||
float32x4_t t = v.m128;
|
||||
t[3] = 0.0f;
|
||||
return vaddvq_f32(t);
|
||||
}
|
||||
|
||||
__forceinline float reduce_add(const Vec3fa& v) {
|
||||
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
|
||||
__forceinline float reduce_min(const Vec3fa& v) {
|
||||
float32x4_t t = v.m128;
|
||||
t[3] = t[2];
|
||||
return vminvq_f32(t);
|
||||
}
|
||||
__forceinline float reduce_max(const Vec3fa& v) {
|
||||
float32x4_t t = v.m128;
|
||||
t[3] = t[2];
|
||||
return vmaxvq_f32(t);
|
||||
}
|
||||
#else
|
||||
__forceinline float reduce_add(const Vec3fa& v) {
|
||||
const vfloat4 a(v.m128);
|
||||
const vfloat4 b = shuffle<1>(a);
|
||||
const vfloat4 c = shuffle<2>(a);
|
||||
|
@ -229,6 +275,7 @@ namespace embree
|
|||
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
|
||||
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
|
||||
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Comparison Operators
|
||||
|
@ -241,8 +288,13 @@ namespace embree
|
|||
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
|
||||
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
|
||||
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
|
||||
__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
|
||||
__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
|
||||
#if defined(__aarch64__)
|
||||
__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
|
||||
__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
|
||||
#else
|
||||
__forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
|
||||
__forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
|
||||
#endif
|
||||
|
||||
__forceinline bool isvalid ( const Vec3fa& v ) {
|
||||
return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
|
||||
|
@ -261,7 +313,7 @@ namespace embree
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
@ -335,7 +387,11 @@ namespace embree
|
|||
/// Rounding Functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined (__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
__forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
|
||||
__forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
|
||||
__forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
|
||||
#elif defined (__SSE4_1__)
|
||||
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
|
||||
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
|
||||
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
|
||||
|
@ -393,8 +449,10 @@ namespace embree
|
|||
|
||||
__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
|
||||
__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
|
||||
__forceinline Vec3fx( const Vec3fa& other, const float w1) {
|
||||
#if defined (__SSE4_1__)
|
||||
__forceinline Vec3fx( const Vec3fa& other, const float w1) {
|
||||
#if defined (__aarch64__)
|
||||
m128 = other.m128; m128[3] = w1;
|
||||
#elif defined (__SSE4_1__)
|
||||
m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
|
||||
#else
|
||||
const vint4 mask(-1,-1,-1,0);
|
||||
|
@ -526,7 +584,7 @@ namespace embree
|
|||
__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
|
||||
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) || defined(__aarch64__)
|
||||
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
|
||||
const vint4 ai = _mm_castps_si128(a.m128);
|
||||
const vint4 bi = _mm_castps_si128(b.m128);
|
||||
|
@ -535,7 +593,7 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) || defined(__aarch64__)
|
||||
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
|
||||
const vint4 ai = _mm_castps_si128(a.m128);
|
||||
const vint4 bi = _mm_castps_si128(b.m128);
|
||||
|
@ -626,7 +684,7 @@ namespace embree
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
|
|
@ -65,7 +65,9 @@ namespace embree
|
|||
|
||||
__forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
|
||||
__forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
|
||||
#if defined(__SSSE3__)
|
||||
#if (defined(__aarch64__))
|
||||
__forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
|
||||
#elif defined(__SSSE3__)
|
||||
__forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
|
||||
#endif
|
||||
|
||||
|
@ -81,7 +83,7 @@ namespace embree
|
|||
__forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); }
|
||||
__forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
|
||||
__forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); }
|
||||
__forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; }
|
||||
|
@ -116,7 +118,7 @@ namespace embree
|
|||
__forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
|
||||
__forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
|
||||
__forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; }
|
||||
#endif
|
||||
|
@ -127,18 +129,38 @@ namespace embree
|
|||
__forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
|
||||
__forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; }
|
||||
|
||||
#if !defined(__ARM_NEON)
|
||||
__forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
|
||||
__forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Select
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
|
||||
#else
|
||||
return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__)
|
||||
__forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); }
|
||||
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
|
||||
__forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); }
|
||||
__forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); }
|
||||
#else
|
||||
__forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
|
||||
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
|
||||
__forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
|
||||
__forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Comparison Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -156,19 +178,7 @@ namespace embree
|
|||
__forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
|
||||
__forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Select
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
|
||||
#if defined(__SSE4_1__)
|
||||
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
|
||||
#else
|
||||
return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
|
||||
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
|
||||
#else
|
||||
|
|
|
@ -149,7 +149,7 @@ namespace embree
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
|
||||
|
@ -205,7 +205,7 @@ namespace embree
|
|||
/// SSE / AVX / MIC specializations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined __SSE__
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
#include "../simd/sse.h"
|
||||
#endif
|
||||
|
||||
|
@ -225,7 +225,7 @@ namespace embree
|
|||
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
|
||||
x = a.x; y = a.y; z = a.z; w = a.w;
|
||||
}
|
||||
#elif defined(__SSE__)
|
||||
#elif defined(__SSE__) || defined(__ARM_NEON)
|
||||
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
|
||||
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -11,33 +11,28 @@
|
|||
|
||||
#include "sse2neon.h"
|
||||
|
||||
__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
|
||||
__m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
|
||||
return _mm_fmadd_ps(a, b, neg_c);
|
||||
}
|
||||
|
||||
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
|
||||
#if defined(__aarch64__)
|
||||
return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
|
||||
vreinterpretq_f32_m128(b),
|
||||
vreinterpretq_f32_m128(a)));
|
||||
#else
|
||||
return _mm_sub_ps(c, _mm_mul_ps(a, b));
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
|
||||
return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
|
||||
__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); }
|
||||
|
||||
__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); }
|
||||
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); }
|
||||
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); }
|
||||
__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); }
|
||||
|
||||
__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
|
||||
{
|
||||
return vdupq_n_f32(*mem_addr);
|
||||
}
|
||||
|
||||
// AVX2 emulation leverages Intel FMA defs above. Include after them.
|
||||
#include "avx2neon.h"
|
||||
|
||||
/* Dummy defines for floating point control */
|
||||
#define _MM_MASK_MASK 0x1f80
|
||||
#define _MM_MASK_DIV_ZERO 0x200
|
||||
#define _MM_FLUSH_ZERO_ON 0x8000
|
||||
// #define _MM_FLUSH_ZERO_ON 0x8000
|
||||
#define _MM_MASK_DENORM 0x100
|
||||
#define _MM_SET_EXCEPTION_MASK(x)
|
||||
#define _MM_SET_FLUSH_ZERO_MODE(x)
|
||||
// #define _MM_SET_FLUSH_ZERO_MODE(x)
|
||||
|
||||
__forceinline int _mm_getcsr()
|
||||
{
|
||||
|
@ -48,3 +43,43 @@ __forceinline void _mm_mfence()
|
|||
{
|
||||
__sync_synchronize();
|
||||
}
|
||||
|
||||
__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
|
||||
{
|
||||
uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
|
||||
uint16x8_t t1 = vmovl_u8(t0);
|
||||
uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
|
||||
return vreinterpretq_s32_u32(t2);
|
||||
}
|
||||
|
||||
__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
|
||||
{
|
||||
uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
|
||||
uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
|
||||
return vreinterpretq_s32_u32(t1);
|
||||
}
|
||||
|
||||
__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
|
||||
{
|
||||
int8x8_t t0 = vld1_s8((int8_t*)ptr);
|
||||
int16x8_t t1 = vmovl_s8(t0);
|
||||
int32x4_t t2 = vmovl_s16(vget_low_s16(t1));
|
||||
float32x4_t t3 = vcvtq_f32_s32(t2);
|
||||
return vreinterpretq_s32_f32(t3);
|
||||
}
|
||||
|
||||
__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
|
||||
{
|
||||
uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
|
||||
uint16x8_t t1 = vmovl_u8(t0);
|
||||
uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
|
||||
return vreinterpretq_s32_u32(t2);
|
||||
}
|
||||
|
||||
__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
|
||||
{
|
||||
int16x8_t t0 = vld1q_s16((int16_t*)ptr);
|
||||
int32x4_t t1 = vmovl_s16(vget_low_s16(t0));
|
||||
float32x4_t t2 = vcvtq_f32_s32(t1);
|
||||
return vreinterpretq_s32_f32(t2);
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -6,7 +6,7 @@
|
|||
#include "../math/math.h"
|
||||
|
||||
/* include SSE wrapper classes */
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
# include "sse.h"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
namespace embree
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
|
||||
return _mm_blendv_ps(f,t,mask);
|
||||
}
|
||||
|
|
|
@ -62,7 +62,11 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
|
||||
#else
|
||||
__forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Array Access
|
||||
|
@ -107,9 +111,10 @@ namespace embree
|
|||
/// Movement/Shifting/Shuffling Functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
|
||||
__forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
|
|
|
@ -116,7 +116,7 @@ namespace embree
|
|||
__forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Convertion Operations
|
||||
/// Conversion Operations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
|
||||
|
|
|
@ -36,9 +36,11 @@ namespace embree
|
|||
|
||||
__forceinline vboolf(__m128 input) : v(input) {}
|
||||
__forceinline operator const __m128&() const { return v; }
|
||||
#if !defined(__EMSCRIPTEN__)
|
||||
__forceinline operator const __m128i() const { return _mm_castps_si128(v); }
|
||||
__forceinline operator const __m128d() const { return _mm_castps_pd(v); }
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline vboolf(bool a)
|
||||
: v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
|
||||
__forceinline vboolf(bool a, bool b)
|
||||
|
@ -100,7 +102,7 @@ namespace embree
|
|||
__forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
|
||||
|
||||
__forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
return _mm_blendv_ps(f, t, m);
|
||||
#else
|
||||
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
|
||||
|
@ -114,6 +116,17 @@ namespace embree
|
|||
__forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
|
||||
__forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
|
||||
|
||||
#if defined(__aarch64__)
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vboolf4 shuffle(const vboolf4& v) {
|
||||
return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
|
||||
return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
#else
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vboolf4 shuffle(const vboolf4& v) {
|
||||
return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
|
@ -123,6 +136,7 @@ namespace embree
|
|||
__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
|
||||
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
}
|
||||
#endif
|
||||
|
||||
template<int i0>
|
||||
__forceinline vboolf4 shuffle(const vboolf4& v) {
|
||||
|
@ -135,7 +149,7 @@ namespace embree
|
|||
template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) && !defined(__aarch64__)
|
||||
template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
|
||||
template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
|
||||
template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
|
||||
|
@ -157,7 +171,9 @@ namespace embree
|
|||
__forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
|
||||
|
||||
__forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__aarch64__)
|
||||
__forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); }
|
||||
#elif defined(__SSE4_2__)
|
||||
__forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
|
||||
#else
|
||||
__forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
|
||||
|
|
|
@ -76,7 +76,7 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
|
||||
__forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
|
||||
__forceinline vboolf(TrueTy) : v(_mm256_castsi256_ps(_mm256_set1_epi32(0xFFFFFFFF))) {}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Array Access
|
||||
|
|
|
@ -189,13 +189,20 @@ namespace embree
|
|||
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
|
||||
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
|
||||
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
|
||||
#else
|
||||
#elif !defined(__aarch64__)
|
||||
__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
|
||||
__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
|
||||
__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
|
||||
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
|
||||
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
|
||||
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
|
||||
#else
|
||||
__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); }
|
||||
__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
|
||||
__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); }
|
||||
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
|
||||
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
|
||||
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); }
|
||||
#endif
|
||||
|
||||
__forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); }
|
||||
|
|
|
@ -177,9 +177,10 @@ namespace embree
|
|||
__forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
|
||||
__forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
|
||||
|
||||
__forceinline vfloat16 rcp(const vfloat16& a) {
|
||||
__forceinline vfloat16 rcp(const vfloat16& a)
|
||||
{
|
||||
const vfloat16 r = _mm512_rcp14_ps(a);
|
||||
return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f)));
|
||||
return _mm512_fmadd_ps(r, _mm512_fnmadd_ps(a, r, vfloat16(1.0)), r); // computes r + r * (1 - a*r)
|
||||
}
|
||||
|
||||
__forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
|
||||
|
|
|
@ -42,6 +42,11 @@ namespace embree
|
|||
__forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
|
||||
|
||||
__forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
|
||||
#if defined(__aarch64__)
|
||||
__forceinline explicit vfloat(const vuint4& x) {
|
||||
v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
|
||||
}
|
||||
#else
|
||||
__forceinline explicit vfloat(const vuint4& x) {
|
||||
const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
|
||||
const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31
|
||||
|
@ -49,7 +54,7 @@ namespace embree
|
|||
const __m128 bf = _mm_castsi128_ps(b);
|
||||
v = _mm_add_ps(af,bf);
|
||||
}
|
||||
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Constants
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -107,7 +112,11 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
static __forceinline vfloat4 load(const char* ptr) {
|
||||
return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
static __forceinline vfloat4 load(const char* ptr) {
|
||||
return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
|
||||
}
|
||||
|
@ -117,7 +126,11 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
static __forceinline vfloat4 load(const unsigned char* ptr) {
|
||||
return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
static __forceinline vfloat4 load(const unsigned char* ptr) {
|
||||
return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
|
||||
}
|
||||
|
@ -128,7 +141,11 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
static __forceinline vfloat4 load(const short* ptr) {
|
||||
return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
static __forceinline vfloat4 load(const short* ptr) {
|
||||
return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
|
||||
}
|
||||
|
@ -145,7 +162,11 @@ namespace embree
|
|||
static __forceinline void store_nt(void* ptr, const vfloat4& v)
|
||||
{
|
||||
#if defined (__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
_mm_stream_ps((float*)ptr,v);
|
||||
#else
|
||||
_mm_stream_ps((float*)ptr,v);
|
||||
#endif
|
||||
#else
|
||||
_mm_store_ps((float*)ptr,v);
|
||||
#endif
|
||||
|
@ -153,7 +174,7 @@ namespace embree
|
|||
|
||||
template<int scale = 4>
|
||||
static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_i32gather_ps(ptr, index, scale);
|
||||
#else
|
||||
return vfloat4(
|
||||
|
@ -169,7 +190,7 @@ namespace embree
|
|||
vfloat4 r = zero;
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
|
||||
#else
|
||||
if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
|
||||
|
@ -223,8 +244,8 @@ namespace embree
|
|||
friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm_mask_blend_ps(m, f, t);
|
||||
#elif defined(__SSE4_1__)
|
||||
return _mm_blendv_ps(f, t, m);
|
||||
#elif defined(__SSE4_1__) || (defined(__aarch64__))
|
||||
return _mm_blendv_ps(f, t, m);
|
||||
#else
|
||||
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
|
||||
#endif
|
||||
|
@ -256,18 +277,34 @@ namespace embree
|
|||
__forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); }
|
||||
|
||||
__forceinline vfloat4 operator +(const vfloat4& a) { return a; }
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vfloat4 operator -(const vfloat4& a) {
|
||||
return vnegq_f32(a);
|
||||
}
|
||||
#else
|
||||
__forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
|
||||
#else
|
||||
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
__forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
|
||||
#else
|
||||
__forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
|
||||
#endif
|
||||
|
||||
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
|
||||
|
||||
|
||||
__forceinline vfloat4 rcp(const vfloat4& a)
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
const vfloat4 r = _mm_rcp14_ps(a);
|
||||
#else
|
||||
|
@ -275,29 +312,38 @@ namespace embree
|
|||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
|
||||
return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r); // computes r + r * (1 - a * r)
|
||||
#else
|
||||
return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
|
||||
return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)
|
||||
#endif
|
||||
|
||||
#endif //defined(__aarch64__)
|
||||
}
|
||||
__forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
|
||||
__forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
|
||||
|
||||
__forceinline vfloat4 rsqrt(const vfloat4& a)
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
vfloat4 r = _mm_rsqrt_ps(a);
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
|
||||
return r;
|
||||
#else
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
vfloat4 r = _mm_rsqrt14_ps(a);
|
||||
#else
|
||||
vfloat4 r = _mm_rsqrt_ps(a);
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
#elif defined(__AVX2__)
|
||||
#if defined(__AVX2__)
|
||||
r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
#else
|
||||
r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
@ -344,7 +390,8 @@ namespace embree
|
|||
__forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); }
|
||||
__forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) || defined(__aarch64__)
|
||||
|
||||
__forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
|
||||
const vint4 ai = _mm_castps_si128(a);
|
||||
const vint4 bi = _mm_castps_si128(b);
|
||||
|
@ -393,9 +440,10 @@ namespace embree
|
|||
__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
|
||||
#else
|
||||
__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
|
||||
__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
|
||||
__forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
|
||||
__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
|
||||
__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -429,8 +477,13 @@ namespace embree
|
|||
__forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
|
||||
__forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
|
||||
__forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
|
||||
__forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
|
||||
#else
|
||||
__forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
|
||||
__forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
|
||||
#endif
|
||||
__forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
|
||||
#endif
|
||||
|
||||
|
@ -484,7 +537,7 @@ namespace embree
|
|||
return select(vboolf4(mask), t, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
__forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
|
||||
return madd(t,b-a,a);
|
||||
}
|
||||
|
@ -506,10 +559,10 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); }
|
||||
__forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); }
|
||||
__forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); }
|
||||
__forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); }
|
||||
__forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
|
||||
__forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
|
||||
__forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
|
||||
__forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
|
||||
#elif defined (__SSE4_1__)
|
||||
__forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
|
||||
__forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
|
||||
|
@ -524,7 +577,9 @@ namespace embree
|
|||
__forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
|
||||
|
||||
__forceinline vint4 floori(const vfloat4& a) {
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
return vcvtq_s32_f32(floor(a));
|
||||
#elif defined(__SSE4_1__)
|
||||
return vint4(floor(a));
|
||||
#else
|
||||
return vint4(a-vfloat4(0.5f));
|
||||
|
@ -538,6 +593,16 @@ namespace embree
|
|||
__forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
|
||||
__forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
|
||||
|
||||
#if defined(__aarch64__)
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vfloat4 shuffle(const vfloat4& v) {
|
||||
return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
|
||||
return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
#else
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vfloat4 shuffle(const vfloat4& v) {
|
||||
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
|
@ -547,8 +612,9 @@ namespace embree
|
|||
__forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
|
||||
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE3__)
|
||||
#if defined(__SSE3__) && !defined(__aarch64__)
|
||||
template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
|
||||
template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
|
||||
template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
|
||||
|
@ -559,10 +625,14 @@ namespace embree
|
|||
return shuffle<i,i,i,i>(v);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__)
|
||||
template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; }
|
||||
#else
|
||||
template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); }
|
||||
template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
|
||||
#endif
|
||||
|
||||
#if defined (__SSE4_1__)
|
||||
#if defined (__SSE4_1__) && !defined(__aarch64__)
|
||||
template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
|
||||
template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
|
||||
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
|
||||
|
@ -664,14 +734,25 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
|
||||
__forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
|
||||
__forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
|
||||
#else
|
||||
__forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
|
||||
__forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
|
||||
__forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
__forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
|
||||
__forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
|
||||
__forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
|
||||
#else
|
||||
__forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
|
||||
__forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
|
||||
__forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
|
||||
#endif
|
||||
|
||||
__forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
|
||||
{
|
||||
|
@ -687,7 +768,7 @@ namespace embree
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators
|
||||
/// Euclidean Space Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline float dot(const vfloat4& a, const vfloat4& b) {
|
||||
|
|
|
@ -107,11 +107,11 @@ namespace embree
|
|||
static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
|
||||
#else
|
||||
static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
|
||||
static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
|
||||
static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
|
||||
static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
|
||||
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
@ -126,7 +126,7 @@ namespace embree
|
|||
|
||||
template<int scale = 4>
|
||||
static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm256_i32gather_ps(ptr, index ,scale);
|
||||
#else
|
||||
return vfloat8(
|
||||
|
@ -146,7 +146,7 @@ namespace embree
|
|||
vfloat8 r = zero;
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
|
||||
#else
|
||||
if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
|
||||
|
@ -215,20 +215,52 @@ namespace embree
|
|||
__forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }
|
||||
|
||||
__forceinline vfloat8 operator +(const vfloat8& a) { return a; }
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vfloat8 operator -(const vfloat8& a) {
|
||||
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
|
||||
return _mm256_xor_ps(a, mask);
|
||||
}
|
||||
#else
|
||||
__forceinline vfloat8 operator -(const vfloat8& a) {
|
||||
__m256 res;
|
||||
res.lo = vnegq_f32(a.v.lo);
|
||||
res.hi = vnegq_f32(a.v.hi);
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vfloat8 abs(const vfloat8& a) {
|
||||
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
|
||||
return _mm256_and_ps(a, mask);
|
||||
}
|
||||
#else
|
||||
__forceinline vfloat8 abs(const vfloat8& a) {
|
||||
__m256 res;
|
||||
res.lo = vabsq_f32(a.v.lo);
|
||||
res.hi = vabsq_f32(a.v.hi);
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
|
||||
#else
|
||||
__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
|
||||
#endif
|
||||
__forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
|
||||
|
||||
|
||||
static __forceinline vfloat8 rcp(const vfloat8& a)
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
vfloat8 ret;
|
||||
const float32x4_t one = vdupq_n_f32(1.0f);
|
||||
ret.v.lo = vdivq_f32(one, a.v.lo);
|
||||
ret.v.hi = vdivq_f32(one, a.v.hi);
|
||||
return ret;
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
const vfloat8 r = _mm256_rcp14_ps(a);
|
||||
#else
|
||||
|
@ -236,9 +268,12 @@ namespace embree
|
|||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
|
||||
// First, compute 1 - a * r (which will be very close to 0)
|
||||
const vfloat8 h_n = _mm256_fnmadd_ps(a, r, vfloat8(1.0f));
|
||||
// Then compute r + r * h_n
|
||||
return _mm256_fmadd_ps(r, h_n, r);
|
||||
#else
|
||||
return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
|
||||
return _mm256_add_ps(r,_mm256_mul_ps(r, _mm256_sub_ps(vfloat8(1.0f), _mm256_mul_ps(a, r)))); // computes r + r * (1 - a * r)
|
||||
#endif
|
||||
}
|
||||
__forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
|
||||
|
@ -384,7 +419,7 @@ namespace embree
|
|||
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
|
||||
return _mm256_mask_blend_ps(m, f, t);
|
||||
}
|
||||
#else
|
||||
#elif !defined(__aarch64__)
|
||||
static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
|
||||
static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
|
||||
static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
|
||||
|
@ -395,6 +430,18 @@ namespace embree
|
|||
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
|
||||
return _mm256_blendv_ps(f, t, m);
|
||||
}
|
||||
#else
|
||||
static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); }
|
||||
static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
|
||||
static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); }
|
||||
static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); }
|
||||
static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); }
|
||||
static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); }
|
||||
|
||||
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
|
||||
return _mm256_blendv_ps(f, t, m);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template<int mask>
|
||||
|
@ -463,10 +510,17 @@ namespace embree
|
|||
/// Rounding Functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
|
||||
__forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
|
||||
__forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
|
||||
__forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
|
||||
#else
|
||||
__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
|
||||
__forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
|
||||
#endif
|
||||
|
||||
|
||||
__forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -501,9 +555,11 @@ namespace embree
|
|||
return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
}
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
|
||||
template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
|
||||
template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
|
||||
#endif
|
||||
|
||||
__forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
|
||||
template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
|
||||
|
@ -512,7 +568,7 @@ namespace embree
|
|||
|
||||
__forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
|
||||
|
||||
#if defined (__AVX2__)
|
||||
#if defined (__AVX2__) && !defined(__aarch64__)
|
||||
static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
|
||||
return _mm256_permutevar8x32_ps(a, index);
|
||||
}
|
||||
|
@ -609,7 +665,7 @@ namespace embree
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
|
||||
__forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
|
||||
__forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
|
||||
|
@ -625,7 +681,14 @@ namespace embree
|
|||
__forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
|
||||
__forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
|
||||
__forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
|
||||
#else
|
||||
__forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); }
|
||||
__forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
|
||||
__forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); }
|
||||
__forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
|
||||
__forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); }
|
||||
|
||||
#endif
|
||||
__forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
|
||||
{
|
||||
const vfloat8 a = select(valid,v,vfloat8(pos_inf));
|
||||
|
@ -642,7 +705,7 @@ namespace embree
|
|||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Euclidian Space Operators (pairs of Vec3fa's)
|
||||
/// Euclidean Space Operators (pairs of Vec3fa's)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
|
||||
|
|
|
@ -106,7 +106,14 @@ namespace embree
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
static __forceinline vint4 load(const unsigned char* ptr) {
|
||||
return _mm_load4epu8_epi32(((__m128i*)ptr));
|
||||
}
|
||||
static __forceinline vint4 loadu(const unsigned char* ptr) {
|
||||
return _mm_load4epu8_epi32(((__m128i*)ptr));
|
||||
}
|
||||
#elif defined(__SSE4_1__)
|
||||
static __forceinline vint4 load(const unsigned char* ptr) {
|
||||
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
|
||||
}
|
||||
|
@ -127,7 +134,9 @@ namespace embree
|
|||
#endif
|
||||
|
||||
static __forceinline vint4 load(const unsigned short* ptr) {
|
||||
#if defined (__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
return __m128i(vmovl_u16(vld1_u16(ptr)));
|
||||
#elif defined (__SSE4_1__)
|
||||
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
|
||||
#else
|
||||
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
|
||||
|
@ -135,7 +144,12 @@ namespace embree
|
|||
}
|
||||
|
||||
static __forceinline void store(unsigned char* ptr, const vint4& v) {
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
int32x4_t x = v;
|
||||
uint16x4_t y = vqmovn_u32(uint32x4_t(x));
|
||||
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
|
||||
vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
|
||||
#elif defined(__SSE4_1__)
|
||||
__m128i x = v;
|
||||
x = _mm_packus_epi32(x, x);
|
||||
x = _mm_packus_epi16(x, x);
|
||||
|
@ -147,20 +161,26 @@ namespace embree
|
|||
}
|
||||
|
||||
/*! Writes the four lanes of v to ptr[0..3] as unsigned shorts.
    NOTE(review): the two paths differ for lanes outside [0, 65535] - the NEON path narrows with
    saturation (vqmovn_u32), the generic path truncates through a plain cast. */
static __forceinline void store(unsigned short* ptr, const vint4& v) {
#if defined(__aarch64__)
  // View the lanes as unsigned, narrow 32 -> 16 bit with saturation, store 4 x u16.
  vst1_u16(ptr, vqmovn_u32(uint32x4_t(v.v)));
#else
  for (size_t lane = 0; lane < 4; ++lane)
    ptr[lane] = (unsigned short)v[lane];
#endif
}
|
||||
|
||||
static __forceinline vint4 load_nt(void* ptr) {
|
||||
#if defined(__SSE4_1__)
|
||||
return _mm_stream_load_si128((__m128i*)ptr);
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
return _mm_stream_load_si128((__m128i*)ptr);
|
||||
#else
|
||||
return _mm_load_si128((__m128i*)ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __forceinline void store_nt(void* ptr, const vint4& v) {
|
||||
#if defined(__SSE4_1__)
|
||||
#if !defined(__aarch64__) && defined(__SSE4_1__)
|
||||
_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
|
||||
#else
|
||||
_mm_store_si128((__m128i*)ptr,v);
|
||||
|
@ -169,7 +189,7 @@ namespace embree
|
|||
|
||||
template<int scale = 4>
|
||||
static __forceinline vint4 gather(const int* ptr, const vint4& index) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_i32gather_epi32(ptr, index, scale);
|
||||
#else
|
||||
return vint4(
|
||||
|
@ -185,7 +205,7 @@ namespace embree
|
|||
vint4 r = zero;
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
|
||||
#else
|
||||
if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
|
||||
|
@ -222,7 +242,7 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#if defined(__x86_64__) || defined(__aarch64__)
|
||||
static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
|
||||
#endif
|
||||
|
||||
|
@ -236,6 +256,8 @@ namespace embree
|
|||
friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
|
||||
#elif defined(__aarch64__)
|
||||
return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
|
||||
#elif defined(__SSE4_1__)
|
||||
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
|
||||
#else
|
||||
|
@ -256,7 +278,9 @@ namespace embree
|
|||
|
||||
__forceinline vint4 operator +(const vint4& a) { return a; }
|
||||
__forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
|
||||
#if defined(__SSSE3__)
|
||||
#if defined(__aarch64__)
|
||||
__forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
|
||||
#elif defined(__SSSE3__)
|
||||
__forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
|
||||
#endif
|
||||
|
||||
|
@ -272,7 +296,7 @@ namespace embree
|
|||
__forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); }
|
||||
__forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if (defined(__aarch64__)) || defined(__SSE4_1__)
|
||||
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
|
||||
#else
|
||||
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
|
||||
|
@ -292,8 +316,8 @@ namespace embree
|
|||
__forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); }
|
||||
__forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; }
|
||||
|
||||
__forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
|
||||
__forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }
|
||||
__forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
|
||||
__forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); }
|
||||
|
||||
__forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
|
||||
__forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
|
||||
|
@ -309,7 +333,7 @@ namespace embree
|
|||
__forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
|
||||
__forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; }
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if (defined(__aarch64__)) || defined(__SSE4_1__)
|
||||
__forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
|
||||
__forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; }
|
||||
#endif
|
||||
|
@ -393,7 +417,7 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
__forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
|
||||
__forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
|
||||
|
||||
|
@ -417,6 +441,16 @@ namespace embree
|
|||
__forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
|
||||
__forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
|
||||
|
||||
#if defined(__aarch64__)
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vint4 shuffle(const vint4& v) {
|
||||
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
|
||||
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
#else
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vint4 shuffle(const vint4& v) {
|
||||
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
|
@ -426,7 +460,7 @@ namespace embree
|
|||
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
|
||||
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
}
|
||||
|
||||
#endif
|
||||
#if defined(__SSE3__)
|
||||
template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
|
||||
template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
|
||||
|
@ -438,7 +472,7 @@ namespace embree
|
|||
return shuffle<i,i,i,i>(v);
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) && !defined(__aarch64__)
|
||||
template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
|
||||
template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
|
||||
#else
|
||||
|
@ -446,18 +480,27 @@ namespace embree
|
|||
template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
|
||||
#endif
|
||||
|
||||
|
||||
template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
|
||||
|
||||
|
||||
__forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
|
||||
|
||||
__forceinline size_t toSizeT(const vint4& v) {
|
||||
|
||||
#if defined(__aarch64__)
|
||||
/*! aarch64: reinterprets the register as two 64-bit unsigned lanes and returns lane 0. */
__forceinline size_t toSizeT(const vint4& v) {
  const uint64x2_t halves = uint64x2_t(v.v);
  return halves[0];
}
|
||||
#else
|
||||
__forceinline size_t toSizeT(const vint4& v) {
|
||||
#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
|
||||
return toScalar(v);
|
||||
#elif defined(__ARM_NEON)
|
||||
// FIXME(LTE): Do we need a swap(i.e. use lane 1)?
|
||||
return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
|
||||
#else
|
||||
return _mm_cvtsi128_si64(v);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
|
@ -475,7 +518,17 @@ namespace embree
|
|||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__)
|
||||
|
||||
#if defined(__aarch64__)
|
||||
/*! aarch64: horizontal minimum (vminvq_s32) replicated into all four lanes. */
__forceinline vint4 vreduce_min(const vint4& v) {
  return vdupq_n_s32(vminvq_s32(v));
}
|
||||
/*! aarch64: horizontal maximum (vmaxvq_s32) replicated into all four lanes. */
__forceinline vint4 vreduce_max(const vint4& v) {
  return vdupq_n_s32(vmaxvq_s32(v));
}
|
||||
/*! aarch64: horizontal sum (vaddvq_s32) replicated into all four lanes. */
__forceinline vint4 vreduce_add(const vint4& v) {
  return vdupq_n_s32(vaddvq_s32(v));
}
|
||||
|
||||
/*! aarch64: returns the smallest of the four lanes as a scalar, via the across-vector intrinsic vminvq_s32. */
__forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
|
||||
/*! aarch64: returns the largest of the four lanes as a scalar, via the across-vector intrinsic vmaxvq_s32. */
__forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
|
||||
/*! aarch64: returns the sum of the four lanes as a scalar, via the across-vector intrinsic vaddvq_s32. */
__forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
|
||||
#else
|
||||
__forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
|
||||
__forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
|
||||
__forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
|
||||
|
@ -483,6 +536,7 @@ namespace embree
|
|||
__forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
|
||||
__forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
|
||||
__forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
|
||||
#endif
|
||||
|
||||
__forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
|
||||
__forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
|
||||
|
@ -502,7 +556,7 @@ namespace embree
|
|||
/// Sorting networks
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if (defined(__aarch64__)) || defined(__SSE4_1__)
|
||||
|
||||
__forceinline vint4 usort_ascending(const vint4& v)
|
||||
{
|
||||
|
|
|
@ -79,8 +79,8 @@ namespace embree
|
|||
static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
|
||||
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
|
||||
|
||||
static __forceinline void store_nt(void* ptr, const vint8& v) {
|
||||
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
|
||||
|
|
|
@ -393,6 +393,7 @@ namespace embree
|
|||
|
||||
__forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vint8 permute(const vint8& v, const __m256i& index) {
|
||||
return _mm256_permutevar8x32_epi32(v, index);
|
||||
}
|
||||
|
@ -410,6 +411,9 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -95,7 +95,14 @@ namespace embree
|
|||
static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
/*! aarch64 path: passes the byte pointer (viewed as __m128i*) to _mm_load4epu8_epi32 and returns the result.
    NOTE(review): the helper is declared outside this file; presumably it zero-extends four bytes into four 32-bit lanes like the SSE4.1 branch - confirm. */
static __forceinline vuint4 load(const unsigned char* ptr) {
  __m128i* const src = (__m128i*)ptr;
  return _mm_load4epu8_epi32(src);
}
|
||||
/*! aarch64 path for the unaligned byte load: issues the same _mm_load4epu8_epi32 call as the aligned variant. */
static __forceinline vuint4 loadu(const unsigned char* ptr) {
  __m128i* const src = (__m128i*)ptr;
  return _mm_load4epu8_epi32(src);
}
|
||||
#elif defined(__SSE4_1__)
|
||||
static __forceinline vuint4 load(const unsigned char* ptr) {
|
||||
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
|
||||
}
|
||||
|
@ -107,7 +114,9 @@ namespace embree
|
|||
#endif
|
||||
|
||||
static __forceinline vuint4 load(const unsigned short* ptr) {
|
||||
#if defined (__SSE4_1__)
|
||||
#if defined(__aarch64__)
|
||||
return _mm_load4epu16_epi32(((__m128i*)ptr));
|
||||
#elif defined (__SSE4_1__)
|
||||
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
|
||||
#else
|
||||
return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
|
||||
|
@ -115,7 +124,7 @@ namespace embree
|
|||
}
|
||||
|
||||
static __forceinline vuint4 load_nt(void* ptr) {
|
||||
#if defined(__SSE4_1__)
|
||||
#if (defined(__aarch64__)) || defined(__SSE4_1__)
|
||||
return _mm_stream_load_si128((__m128i*)ptr);
|
||||
#else
|
||||
return _mm_load_si128((__m128i*)ptr);
|
||||
|
@ -123,8 +132,8 @@ namespace embree
|
|||
}
|
||||
|
||||
static __forceinline void store_nt(void* ptr, const vuint4& v) {
|
||||
#if defined(__SSE4_1__)
|
||||
_mm_stream_ps((float*)ptr,_mm_castsi128_ps(v));
|
||||
#if !defined(__aarch64__) && defined(__SSE4_1__)
|
||||
_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
|
||||
#else
|
||||
_mm_store_si128((__m128i*)ptr,v);
|
||||
#endif
|
||||
|
@ -132,7 +141,7 @@ namespace embree
|
|||
|
||||
template<int scale = 4>
|
||||
static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_i32gather_epi32((const int*)ptr, index, scale);
|
||||
#else
|
||||
return vuint4(
|
||||
|
@ -148,7 +157,7 @@ namespace embree
|
|||
vuint4 r = zero;
|
||||
#if defined(__AVX512VL__)
|
||||
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
|
||||
#else
|
||||
if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
|
||||
|
@ -344,6 +353,16 @@ namespace embree
|
|||
__forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
|
||||
__forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
|
||||
|
||||
#if defined(__aarch64__)
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vuint4 shuffle(const vuint4& v) {
|
||||
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
|
||||
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
|
||||
}
|
||||
#else
|
||||
template<int i0, int i1, int i2, int i3>
|
||||
__forceinline vuint4 shuffle(const vuint4& v) {
|
||||
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
|
@ -353,7 +372,7 @@ namespace embree
|
|||
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
|
||||
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
|
||||
}
|
||||
|
||||
#endif
|
||||
#if defined(__SSE3__)
|
||||
template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
|
||||
template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
|
||||
|
@ -365,7 +384,7 @@ namespace embree
|
|||
return shuffle<i,i,i,i>(v);
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__) && !defined(__aarch64__)
|
||||
template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
|
||||
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
|
||||
#else
|
||||
|
@ -373,7 +392,6 @@ namespace embree
|
|||
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
|
||||
#endif
|
||||
|
||||
|
||||
template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
|
||||
|
||||
__forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
|
||||
|
|
|
@ -77,8 +77,8 @@ namespace embree
|
|||
static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
|
||||
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
|
||||
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
|
||||
|
||||
static __forceinline void store_nt(void* ptr, const vuint8& v) {
|
||||
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
|
||||
|
|
|
@ -385,6 +385,7 @@ namespace embree
|
|||
|
||||
__forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
__forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
|
||||
return _mm256_permutevar8x32_epi32(v, index);
|
||||
}
|
||||
|
@ -401,6 +402,7 @@ namespace embree
|
|||
return _mm256_alignr_epi8(a, b, 4*i);
|
||||
#endif
|
||||
}
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Reductions
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
// Copyright 2009-2020 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
#pragma once
|
||||
|
||||
// According to https://emscripten.org/docs/porting/simd.html, _MM_SET_EXCEPTION_MASK and
|
||||
// _mm_setcsr are unavailable in WebAssembly.
|
||||
|
||||
#define _MM_SET_EXCEPTION_MASK(x)
|
||||
|
||||
// Stand-in so code that calls _mm_setcsr still compiles for WebAssembly:
// the unnamed argument is accepted and ignored, and the body is intentionally empty.
__forceinline void _mm_setcsr(unsigned int)
|
||||
{
|
||||
}
|
|
@ -59,8 +59,8 @@ namespace embree
|
|||
|
||||
/********************** Iterators ****************************/
|
||||
|
||||
__forceinline T* begin() const { return items; };
|
||||
__forceinline T* end () const { return items+M; };
|
||||
__forceinline T* begin() const { return (T*)items; };
|
||||
__forceinline T* end () const { return (T*)items+M; };
|
||||
|
||||
|
||||
/********************** Capacity ****************************/
|
||||
|
@ -101,8 +101,8 @@ namespace embree
|
|||
__forceinline T& at(size_t i) { assert(i < M); return items[i]; }
|
||||
__forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
|
||||
|
||||
__forceinline T& front() const { assert(M > 0); return items[0]; };
|
||||
__forceinline T& back () const { assert(M > 0); return items[M-1]; };
|
||||
__forceinline T& front() { assert(M > 0); return items[0]; };
|
||||
__forceinline T& back () { assert(M > 0); return items[M-1]; };
|
||||
|
||||
__forceinline T* data() { return items; };
|
||||
__forceinline const T* data() const { return items; };
|
||||
|
@ -139,7 +139,7 @@ namespace embree
|
|||
__forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; }
|
||||
__forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
|
||||
|
||||
#if defined(__64BIT__)
|
||||
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
|
||||
__forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; }
|
||||
__forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
|
||||
#endif
|
||||
|
@ -196,7 +196,7 @@ namespace embree
|
|||
__forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
|
||||
__forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
|
||||
|
||||
#if defined(__64BIT__)
|
||||
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
|
||||
__forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
|
||||
#endif
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ namespace embree
|
|||
BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
|
||||
|
||||
public:
|
||||
/*! intializes the barrier with some number of threads */
|
||||
/*! initializes the barrier with some number of threads */
|
||||
void init(size_t count);
|
||||
|
||||
/*! lets calling thread wait in barrier */
|
||||
|
@ -94,7 +94,7 @@ namespace embree
|
|||
LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
|
||||
|
||||
public:
|
||||
/*! intializes the barrier with some number of threads */
|
||||
/*! initializes the barrier with some number of threads */
|
||||
void init(size_t threadCount);
|
||||
|
||||
/*! thread with threadIndex waits in the barrier */
|
||||
|
|
|
@ -13,6 +13,9 @@
|
|||
#include "../simd/arm/emulation.h"
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
#include "../simd/wasm/emulation.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
|
@ -24,24 +27,26 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__LZCNT__)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(_lzcnt_u32)
|
||||
#define _lzcnt_u32 __lzcnt32
|
||||
#define _lzcnt_u32 __builtin_clz
|
||||
#endif
|
||||
#if !defined(_lzcnt_u64)
|
||||
#define _lzcnt_u64 __lzcnt64
|
||||
#else
|
||||
#if defined(__LZCNT__)
|
||||
#if !defined(_lzcnt_u32)
|
||||
#define _lzcnt_u32 __lzcnt32
|
||||
#endif
|
||||
#if !defined(_lzcnt_u64)
|
||||
#define _lzcnt_u64 __lzcnt64
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__)
|
||||
// -- GODOT start --
|
||||
#if !defined(NOMINMAX)
|
||||
// -- GODOT end --
|
||||
#define NOMINMAX
|
||||
// -- GODOT start --
|
||||
#endif
|
||||
#include "windows.h"
|
||||
// -- GODOT end --
|
||||
# if !defined(NOMINMAX)
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
/* normally defined in pmmintrin.h, but we always need this */
|
||||
|
@ -69,7 +74,7 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline int bsf(int v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _tzcnt_u32(v);
|
||||
#else
|
||||
unsigned long r = 0; _BitScanForward(&r,v); return r;
|
||||
|
@ -77,7 +82,7 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline unsigned bsf(unsigned v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _tzcnt_u32(v);
|
||||
#else
|
||||
unsigned long r = 0; _BitScanForward(&r,v); return r;
|
||||
|
@ -118,7 +123,7 @@ namespace embree
|
|||
#endif
|
||||
|
||||
__forceinline int bsr(int v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return 31 - _lzcnt_u32(v);
|
||||
#else
|
||||
unsigned long r = 0; _BitScanReverse(&r,v); return r;
|
||||
|
@ -126,7 +131,7 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline unsigned bsr(unsigned v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return 31 - _lzcnt_u32(v);
|
||||
#else
|
||||
unsigned long r = 0; _BitScanReverse(&r,v); return r;
|
||||
|
@ -145,7 +150,7 @@ namespace embree
|
|||
|
||||
__forceinline int lzcnt(const int x)
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _lzcnt_u32(x);
|
||||
#else
|
||||
if (unlikely(x == 0)) return 32;
|
||||
|
@ -214,15 +219,26 @@ namespace embree
|
|||
#elif defined(__X86_ASM__)
|
||||
|
||||
__forceinline void __cpuid(int out[4], int op) {
|
||||
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
|
||||
#if defined(__ARM_NEON)
|
||||
if (op == 0) { // Get CPU name
|
||||
out[0] = 0x41524d20;
|
||||
out[1] = 0x41524d20;
|
||||
out[2] = 0x41524d20;
|
||||
out[3] = 0x41524d20;
|
||||
}
|
||||
#else
|
||||
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#if !defined(__ARM_NEON)
|
||||
__forceinline void __cpuid_count(int out[4], int op1, int op2) {
|
||||
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline uint64_t read_tsc() {
|
||||
#if defined(__X86_ASM__)
|
||||
uint32_t high,low;
|
||||
|
@ -235,30 +251,38 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline int bsf(int v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__ARM_NEON)
|
||||
return __builtin_ctz(v);
|
||||
#else
|
||||
#if defined(__AVX2__)
|
||||
return _tzcnt_u32(v);
|
||||
#elif defined(__X86_ASM__)
|
||||
int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
|
||||
#else
|
||||
return __builtin_ctz(v);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__64BIT__)
|
||||
__forceinline unsigned bsf(unsigned v)
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__ARM_NEON)
|
||||
return __builtin_ctz(v);
|
||||
#else
|
||||
#if defined(__AVX2__)
|
||||
return _tzcnt_u32(v);
|
||||
#elif defined(__X86_ASM__)
|
||||
unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
|
||||
#else
|
||||
return __builtin_ctz(v);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
__forceinline size_t bsf(size_t v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
#if defined(__X86_64__)
|
||||
return _tzcnt_u64(v);
|
||||
#else
|
||||
|
@ -295,7 +319,7 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline int bsr(int v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return 31 - _lzcnt_u32(v);
|
||||
#elif defined(__X86_ASM__)
|
||||
int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
|
||||
|
@ -304,7 +328,7 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
#if defined(__64BIT__)
|
||||
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
|
||||
__forceinline unsigned bsr(unsigned v) {
|
||||
#if defined(__AVX2__)
|
||||
return 31 - _lzcnt_u32(v);
|
||||
|
@ -317,7 +341,7 @@ namespace embree
|
|||
#endif
|
||||
|
||||
__forceinline size_t bsr(size_t v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
#if defined(__X86_64__)
|
||||
return 63 - _lzcnt_u64(v);
|
||||
#else
|
||||
|
@ -332,7 +356,7 @@ namespace embree
|
|||
|
||||
__forceinline int lzcnt(const int x)
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
return _lzcnt_u32(x);
|
||||
#else
|
||||
if (unlikely(x == 0)) return 32;
|
||||
|
@ -341,18 +365,18 @@ namespace embree
|
|||
}
|
||||
|
||||
__forceinline size_t blsr(size_t v) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
#if defined(__INTEL_COMPILER)
|
||||
return _blsr_u64(v);
|
||||
#else
|
||||
#if defined(__X86_64__)
|
||||
return __blsr_u64(v);
|
||||
#else
|
||||
return __blsr_u32(v);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if defined(__X86_64__)
|
||||
return __blsr_u64(v);
|
||||
#else
|
||||
return __blsr_u32(v);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
return v & (v-1);
|
||||
return v & (v-1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -368,7 +392,7 @@ namespace embree
|
|||
#if defined(__X86_ASM__)
|
||||
int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
|
||||
#else
|
||||
return (v | (v << i));
|
||||
return (v | (1 << i));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -376,7 +400,7 @@ namespace embree
|
|||
#if defined(__X86_ASM__)
|
||||
int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
|
||||
#else
|
||||
return (v & ~(v << i));
|
||||
return (v & ~(1 << i));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -392,7 +416,7 @@ namespace embree
|
|||
#if defined(__X86_ASM__)
|
||||
size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
|
||||
#else
|
||||
return (v | (v << i));
|
||||
// Non-asm fallback for the size_t overload: set bit i of v.
// The mask is built in size_t; a bare `1` is an int, so `1 << i` would be shifted at
// int width and could not address bits 31 and above of a 64-bit value.
return (v | (size_t(1) << i));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -400,7 +424,7 @@ namespace embree
|
|||
#if defined(__X86_ASM__)
|
||||
size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
|
||||
#else
|
||||
return (v & ~(v << i));
|
||||
// Non-asm fallback for the size_t overload: clear bit i of v.
// The mask is built in size_t before complementing; with an int-width `1 << i` the
// complemented mask for i == 31 would typically widen to 0x7FFFFFFF and wipe the upper half of a 64-bit v.
return (v & ~(size_t(1) << i));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -435,8 +459,8 @@ namespace embree
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
|
||||
__forceinline int popcnt(int in) {
|
||||
return _mm_popcnt_u32(in);
|
||||
}
|
||||
|
@ -483,14 +507,14 @@ namespace embree
|
|||
#endif
|
||||
}
|
||||
|
||||
__forceinline void prefetchL1EX(const void* ptr) {
|
||||
prefetchEX(ptr);
|
||||
__forceinline void prefetchL1EX(const void* ptr) {
|
||||
prefetchEX(ptr);
|
||||
}
|
||||
|
||||
__forceinline void prefetchL2EX(const void* ptr) {
|
||||
prefetchEX(ptr);
|
||||
|
||||
__forceinline void prefetchL2EX(const void* ptr) {
|
||||
prefetchEX(ptr);
|
||||
}
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__) && !defined(__aarch64__)
|
||||
__forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
|
||||
__forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
|
||||
#if defined(__X86_64__)
|
||||
|
|
|
@ -36,6 +36,7 @@ namespace embree
|
|||
MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
|
||||
assert(ok);
|
||||
delete (pthread_mutex_t*)mutex;
|
||||
mutex = nullptr;
|
||||
}
|
||||
|
||||
void MutexSys::lock()
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "intrinsics.h"
|
||||
#include "atomic.h"
|
||||
|
||||
#define CPU_CACHELINE_SIZE 64
|
||||
namespace embree
|
||||
{
|
||||
/*! system mutex */
|
||||
|
@ -83,6 +84,11 @@ namespace embree
|
|||
atomic<bool> flag;
|
||||
};
|
||||
|
||||
class PaddedSpinLock : public SpinLock
|
||||
{
|
||||
private:
|
||||
char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)];
|
||||
};
|
||||
/*! safe mutex lock and unlock helper */
|
||||
template<typename Mutex> class Lock {
|
||||
public:
|
||||
|
|
|
@ -92,16 +92,19 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef __WIN32__
|
||||
#define dll_export __declspec(dllexport)
|
||||
#define dll_import __declspec(dllimport)
|
||||
# if defined(EMBREE_STATIC_LIB)
|
||||
# define dll_export
|
||||
# define dll_import
|
||||
# else
|
||||
# define dll_export __declspec(dllexport)
|
||||
# define dll_import __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
#define dll_export __attribute__ ((visibility ("default")))
|
||||
#define dll_import
|
||||
# define dll_export __attribute__ ((visibility ("default")))
|
||||
# define dll_import
|
||||
#endif
|
||||
|
||||
// -- GODOT start --
|
||||
#if defined(__WIN32__) && !defined(__MINGW32__)
|
||||
// -- GODOT end --
|
||||
#if !defined(__noinline)
|
||||
#define __noinline __declspec(noinline)
|
||||
#endif
|
||||
|
@ -151,9 +154,7 @@
|
|||
#define DELETED = delete
|
||||
#endif
|
||||
|
||||
// -- GODOT start --
|
||||
#if !defined(likely)
|
||||
// -- GODOT end --
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#define likely(expr) (expr)
|
||||
#define unlikely(expr) (expr)
|
||||
|
@ -161,9 +162,7 @@
|
|||
#define likely(expr) __builtin_expect((bool)(expr),true )
|
||||
#define unlikely(expr) __builtin_expect((bool)(expr),false)
|
||||
#endif
|
||||
// -- GODOT start --
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Error handling and debugging
|
||||
|
@ -252,6 +251,7 @@ __forceinline std::string toString(long long value) {
|
|||
#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
|
||||
//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
|
||||
#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
|
||||
#pragma warning(disable:4267) // conversion from 'size_t' to 'const int', possible loss of data
|
||||
//#pragma warning(disable:4355) // 'this' : used in base member initializer list
|
||||
//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
|
||||
//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
|
||||
|
|
|
@ -21,7 +21,11 @@ namespace embree
|
|||
|
||||
std::string getPlatformName()
|
||||
{
|
||||
#if defined(__LINUX__) && !defined(__64BIT__)
|
||||
#if defined(__ANDROID__) && !defined(__64BIT__)
|
||||
return "Android (32bit)";
|
||||
#elif defined(__ANDROID__) && defined(__64BIT__)
|
||||
return "Android (64bit)";
|
||||
#elif defined(__LINUX__) && !defined(__64BIT__)
|
||||
return "Linux (32bit)";
|
||||
#elif defined(__LINUX__) && defined(__64BIT__)
|
||||
return "Linux (64bit)";
|
||||
|
@ -248,9 +252,7 @@ namespace embree
|
|||
#if defined(__X86_ASM__)
|
||||
__noinline int64_t get_xcr0()
|
||||
{
|
||||
// -- GODOT start --
|
||||
#if defined (__WIN32__) && !defined (__MINGW32__)
|
||||
// -- GODOT end --
|
||||
#if defined (__WIN32__) && !defined (__MINGW32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
|
||||
int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
|
||||
xcr0 = _xgetbv(0);
|
||||
return xcr0;
|
||||
|
@ -337,9 +339,24 @@ namespace embree
|
|||
if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
|
||||
|
||||
return cpu_features;
|
||||
#elif defined(__ARM_NEON)
|
||||
/* emulated features with sse2neon */
|
||||
return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED;
|
||||
|
||||
#elif defined(__ARM_NEON) || defined(__EMSCRIPTEN__)
|
||||
|
||||
int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2;
|
||||
cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
|
||||
cpu_features |= CPU_FEATURE_XMM_ENABLED;
|
||||
cpu_features |= CPU_FEATURE_YMM_ENABLED;
|
||||
cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
|
||||
cpu_features |= CPU_FEATURE_POPCNT;
|
||||
cpu_features |= CPU_FEATURE_AVX;
|
||||
cpu_features |= CPU_FEATURE_AVX2;
|
||||
cpu_features |= CPU_FEATURE_FMA3;
|
||||
cpu_features |= CPU_FEATURE_LZCNT;
|
||||
cpu_features |= CPU_FEATURE_BMI1;
|
||||
cpu_features |= CPU_FEATURE_BMI2;
|
||||
cpu_features |= CPU_FEATURE_NEON_2X;
|
||||
return cpu_features;
|
||||
|
||||
#else
|
||||
/* Unknown CPU. */
|
||||
return 0;
|
||||
|
@ -376,6 +393,8 @@ namespace embree
|
|||
if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
|
||||
if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
|
||||
if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
|
||||
if (features & CPU_FEATURE_NEON) str += "NEON ";
|
||||
if (features & CPU_FEATURE_NEON_2X) str += "2xNEON ";
|
||||
return str;
|
||||
}
|
||||
|
||||
|
@ -390,6 +409,9 @@ namespace embree
|
|||
if (isa == AVX) return "AVX";
|
||||
if (isa == AVX2) return "AVX2";
|
||||
if (isa == AVX512) return "AVX512";
|
||||
|
||||
if (isa == NEON) return "NEON";
|
||||
if (isa == NEON_2X) return "2xNEON";
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
|
@ -410,6 +432,9 @@ namespace embree
|
|||
if (hasISA(features,AVXI)) v += "AVXI ";
|
||||
if (hasISA(features,AVX2)) v += "AVX2 ";
|
||||
if (hasISA(features,AVX512)) v += "AVX512 ";
|
||||
|
||||
if (hasISA(features,NEON)) v += "NEON ";
|
||||
if (hasISA(features,NEON_2X)) v += "2xNEON ";
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
@ -613,6 +638,10 @@ namespace embree
|
|||
#include <sys/time.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
#include <emscripten.h>
|
||||
#endif
|
||||
|
||||
namespace embree
|
||||
{
|
||||
unsigned int getNumberOfLogicalThreads()
|
||||
|
@ -620,12 +649,25 @@ namespace embree
|
|||
static int nThreads = -1;
|
||||
if (nThreads != -1) return nThreads;
|
||||
|
||||
// -- GODOT start --
|
||||
// #if defined(__MACOSX__)
|
||||
#if defined(__MACOSX__) || defined(__ANDROID__)
|
||||
// -- GODOT end --
|
||||
nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
|
||||
assert(nThreads);
|
||||
#elif defined(__EMSCRIPTEN__)
|
||||
// WebAssembly supports pthreads, but not pthread_getaffinity_np. Get the number of logical
|
||||
// threads from the browser or Node.js using JavaScript.
|
||||
nThreads = MAIN_THREAD_EM_ASM_INT({
|
||||
const isBrowser = typeof window !== 'undefined';
|
||||
const isNode = typeof process !== 'undefined' && process.versions != null &&
|
||||
process.versions.node != null;
|
||||
if (isBrowser) {
|
||||
// Return 1 if the browser does not expose hardwareConcurrency.
|
||||
return window.navigator.hardwareConcurrency || 1;
|
||||
} else if (isNode) {
|
||||
return require('os').cpus().length;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
});
|
||||
#else
|
||||
cpu_set_t set;
|
||||
if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
|
||||
|
|
|
@ -55,7 +55,12 @@
|
|||
# define isa sse
|
||||
# define ISA SSE
|
||||
# define ISA_STR "SSE"
|
||||
#else
|
||||
#elif defined(__ARM_NEON)
|
||||
// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment.
|
||||
#define isa sse2
|
||||
#define ISA NEON
|
||||
#define ISA_STR "NEON"
|
||||
#else
|
||||
#error Unknown ISA
|
||||
#endif
|
||||
|
||||
|
@ -133,7 +138,9 @@ namespace embree
|
|||
static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
|
||||
static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
|
||||
static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
|
||||
|
||||
static const int CPU_FEATURE_NEON = 1 << 28;
|
||||
static const int CPU_FEATURE_NEON_2X = 1 << 29;
|
||||
|
||||
/*! get CPU features */
|
||||
int getCPUFeatures();
|
||||
|
||||
|
@ -154,6 +161,8 @@ namespace embree
|
|||
static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
|
||||
static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
|
||||
static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
|
||||
static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;
|
||||
static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2;
|
||||
|
||||
/*! converts ISA bitvector into a string */
|
||||
std::string stringOfISA(int features);
|
||||
|
|
|
@ -10,6 +10,9 @@
|
|||
#include "../simd/arm/emulation.h"
|
||||
#else
|
||||
#include <xmmintrin.h>
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
#include "../simd/wasm/emulation.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(PTHREADS_WIN32)
|
||||
|
@ -158,9 +161,7 @@ namespace embree
|
|||
/// Linux Platform
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// -- GODOT start --
|
||||
#if defined(__LINUX__) && !defined(__ANDROID__)
|
||||
// -- GODOT end --
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
@ -219,6 +220,8 @@ namespace embree
|
|||
|
||||
/* find correct thread to affinitize to */
|
||||
cpu_set_t set;
|
||||
CPU_ZERO(&set);
|
||||
|
||||
if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
|
||||
{
|
||||
for (int i=0, j=0; i<CPU_SETSIZE; i++)
|
||||
|
@ -241,7 +244,8 @@ namespace embree
|
|||
{
|
||||
cpu_set_t cset;
|
||||
CPU_ZERO(&cset);
|
||||
size_t threadID = mapThreadID(affinity);
|
||||
//size_t threadID = mapThreadID(affinity); // this is not working properly in LXC containers when some processors are disabled
|
||||
size_t threadID = affinity;
|
||||
CPU_SET(threadID, &cset);
|
||||
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
|
||||
|
@ -249,7 +253,6 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
// -- GODOT start --
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Android Platform
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -269,7 +272,6 @@ namespace embree
|
|||
}
|
||||
}
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// FreeBSD Platform
|
||||
|
@ -293,6 +295,21 @@ namespace embree
|
|||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// WebAssembly Platform
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
namespace embree
|
||||
{
|
||||
/*! set affinity of the calling thread */
|
||||
void setAffinity(ssize_t affinity)
|
||||
{
|
||||
// Setting thread affinity is not supported in WASM.
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// MacOSX Platform
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -379,9 +396,7 @@ namespace embree
|
|||
pthread_attr_destroy(&attr);
|
||||
|
||||
/* set affinity */
|
||||
// -- GODOT start --
|
||||
#if defined(__LINUX__) && !defined(__ANDROID__)
|
||||
// -- GODOT end --
|
||||
if (threadID >= 0) {
|
||||
cpu_set_t cset;
|
||||
CPU_ZERO(&cset);
|
||||
|
@ -396,7 +411,6 @@ namespace embree
|
|||
CPU_SET(threadID, &cset);
|
||||
pthread_setaffinity_np(*tid, sizeof(cset), &cset);
|
||||
}
|
||||
// -- GODOT start --
|
||||
#elif defined(__ANDROID__)
|
||||
if (threadID >= 0) {
|
||||
cpu_set_t cset;
|
||||
|
@ -405,7 +419,6 @@ namespace embree
|
|||
sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
|
||||
}
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
return thread_t(tid);
|
||||
}
|
||||
|
@ -424,14 +437,12 @@ namespace embree
|
|||
|
||||
/*! destroy a hardware thread by its handle */
|
||||
void destroyThread(thread_t tid) {
|
||||
// -- GODOT start --
|
||||
#if defined(__ANDROID__)
|
||||
FATAL("Can't destroy threads on Android.");
|
||||
FATAL("Can't destroy threads on Android."); // pthread_cancel not implemented.
|
||||
#else
|
||||
pthread_cancel(*(pthread_t*)tid);
|
||||
delete (pthread_t*)tid;
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
}
|
||||
|
||||
/*! creates thread local storage */
|
||||
|
|
|
@ -127,14 +127,15 @@ namespace embree
|
|||
{
|
||||
assert(!empty());
|
||||
size_active--;
|
||||
alloc.destroy(&items[size_active]);
|
||||
items[size_active].~T();
|
||||
}
|
||||
|
||||
__forceinline void clear()
|
||||
{
|
||||
/* destroy elements */
|
||||
for (size_t i=0; i<size_active; i++)
|
||||
alloc.destroy(&items[i]);
|
||||
for (size_t i=0; i<size_active; i++){
|
||||
items[i].~T();
|
||||
}
|
||||
|
||||
/* free memory */
|
||||
alloc.deallocate(items,size_alloced);
|
||||
|
@ -178,8 +179,9 @@ namespace embree
|
|||
/* destroy elements */
|
||||
if (new_active < size_active)
|
||||
{
|
||||
for (size_t i=new_active; i<size_active; i++)
|
||||
alloc.destroy(&items[i]);
|
||||
for (size_t i=new_active; i<size_active; i++){
|
||||
items[i].~T();
|
||||
}
|
||||
size_active = new_active;
|
||||
}
|
||||
|
||||
|
@ -195,7 +197,7 @@ namespace embree
|
|||
items = alloc.allocate(new_alloced);
|
||||
for (size_t i=0; i<size_active; i++) {
|
||||
::new (&items[i]) T(std::move(old_items[i]));
|
||||
alloc.destroy(&old_items[i]);
|
||||
old_items[i].~T();
|
||||
}
|
||||
|
||||
for (size_t i=size_active; i<new_active; i++) {
|
||||
|
|
|
@ -143,7 +143,7 @@ namespace embree
|
|||
/* allocate new task on right side of stack */
|
||||
size_t oldStackPtr = stackPtr;
|
||||
TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
|
||||
new (&tasks[right]) Task(func,thread.task,oldStackPtr,size);
|
||||
new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
|
||||
right++;
|
||||
|
||||
/* also move left pointer */
|
||||
|
|
|
@ -11,14 +11,8 @@
|
|||
#include "../sys/condition.h"
|
||||
#include "../sys/ref.h"
|
||||
|
||||
#if defined(__WIN32__)
|
||||
// -- GODOT start --
|
||||
#if !defined(NOMINMAX)
|
||||
// -- GODOT end --
|
||||
#if defined(__WIN32__) && !defined(NOMINMAX)
|
||||
# define NOMINMAX
|
||||
// -- GODOT start --
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
#endif
|
||||
|
||||
// We need to define these to avoid implicit linkage against
|
||||
|
|
|
@ -19,9 +19,7 @@ typedef int ssize_t;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
// -- GODOT start --
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
// -- GODOT end --
|
||||
#if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
|
||||
#else
|
||||
# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
// Copyright 2009-2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
|
@ -6,23 +5,25 @@
|
|||
|
||||
#define RTC_VERSION_MAJOR 3
|
||||
#define RTC_VERSION_MINOR 13
|
||||
#define RTC_VERSION_PATCH 1
|
||||
#define RTC_VERSION 31301
|
||||
#define RTC_VERSION_STRING "3.13.1"
|
||||
#define RTC_VERSION_PATCH 5
|
||||
#define RTC_VERSION 31305
|
||||
#define RTC_VERSION_STRING "3.13.5"
|
||||
|
||||
#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
|
||||
|
||||
#define EMBREE_MIN_WIDTH 0
|
||||
#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
|
||||
|
||||
#define EMBREE_STATIC_LIB
|
||||
/* #undef EMBREE_API_NAMESPACE */
|
||||
#if !defined(EMBREE_STATIC_LIB)
|
||||
# define EMBREE_STATIC_LIB
|
||||
#endif
|
||||
/* #undef EMBREE_API_NAMESPACE*/
|
||||
|
||||
#if defined(EMBREE_API_NAMESPACE)
|
||||
# define RTC_NAMESPACE
|
||||
# define RTC_NAMESPACE_BEGIN namespace {
|
||||
# define RTC_NAMESPACE_BEGIN namespace {
|
||||
# define RTC_NAMESPACE_END }
|
||||
# define RTC_NAMESPACE_USE using namespace ;
|
||||
# define RTC_NAMESPACE_USE using namespace;
|
||||
# define RTC_API_EXTERN_C
|
||||
# undef EMBREE_API_NAMESPACE
|
||||
#else
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
RTC_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Structure for transformation respresentation as a matrix decomposition using
|
||||
* Structure for transformation representation as a matrix decomposition using
|
||||
* a quaternion
|
||||
*/
|
||||
struct RTC_ALIGN(16) RTCQuaternionDecomposition
|
||||
|
|
|
@ -47,9 +47,12 @@ RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigne
|
|||
/* Detaches the geometry from the scene. */
|
||||
RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
|
||||
|
||||
/* Gets a geometry handle from the scene. */
|
||||
/* Gets a geometry handle from the scene. This function is not thread safe and should get used during rendering. */
|
||||
RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
|
||||
|
||||
/* Gets a geometry handle from the scene. This function is thread safe and should NOT get used during rendering. */
|
||||
RTC_API RTCGeometry rtcGetGeometryThreadSafe(RTCScene scene, unsigned int geomID);
|
||||
|
||||
|
||||
/* Commits the scene. */
|
||||
RTC_API void rtcCommitScene(RTCScene scene);
|
||||
|
|
|
@ -411,7 +411,7 @@ namespace embree
|
|||
ReductionTy bounds[MAX_BRANCHING_FACTOR];
|
||||
if (current.size() > singleThreadThreshold)
|
||||
{
|
||||
/*! parallel_for is faster than spawing sub-tasks */
|
||||
/*! parallel_for is faster than spawning sub-tasks */
|
||||
parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
|
||||
for (size_t i=r.begin(); i<r.end(); i++) {
|
||||
bounds[i] = recurse(depth+1,children[i],nullptr,true);
|
||||
|
|
|
@ -374,7 +374,7 @@ namespace embree
|
|||
|
||||
const size_t begin = set.begin();
|
||||
const size_t end = set.end();
|
||||
const size_t center = (begin + end)/2;
|
||||
const size_t center = (begin + end + 1) / 2;
|
||||
|
||||
PrimInfoMB linfo = empty;
|
||||
for (size_t i=begin; i<center; i++)
|
||||
|
@ -594,7 +594,7 @@ namespace embree
|
|||
/* spawn tasks */
|
||||
if (unlikely(current.size() > cfg.singleThreadThreshold))
|
||||
{
|
||||
/*! parallel_for is faster than spawing sub-tasks */
|
||||
/*! parallel_for is faster than spawning sub-tasks */
|
||||
parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
|
||||
for (size_t i=r.begin(); i<r.end(); i++) {
|
||||
values[i] = recurse(children[i],nullptr,true);
|
||||
|
|
|
@ -298,7 +298,7 @@ namespace embree
|
|||
/* spawn tasks */
|
||||
if (current.size() > cfg.singleThreadThreshold)
|
||||
{
|
||||
/*! parallel_for is faster than spawing sub-tasks */
|
||||
/*! parallel_for is faster than spawning sub-tasks */
|
||||
parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
|
||||
for (size_t i=r.begin(); i<r.end(); i++) {
|
||||
values[i] = recurse(children[i],nullptr,true);
|
||||
|
|
|
@ -57,14 +57,12 @@ namespace embree
|
|||
__forceinline Vec3ia bin(const Vec3fa& p) const
|
||||
{
|
||||
const vint4 i = floori((vfloat4(p)-ofs)*scale);
|
||||
#if 1
|
||||
assert(i[0] >= 0 && (size_t)i[0] < num);
|
||||
assert(i[1] >= 0 && (size_t)i[1] < num);
|
||||
assert(i[2] >= 0 && (size_t)i[2] < num);
|
||||
return Vec3ia(i);
|
||||
#else
|
||||
|
||||
// we clamp to handle corner cases that could calculate out of bounds bin
|
||||
return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*! faster but unsafe binning */
|
||||
|
|
|
@ -275,7 +275,7 @@ namespace embree
|
|||
openNodesBasedOnExtend(set);
|
||||
#endif
|
||||
|
||||
/* disable opening when unsufficient space for opening a node available */
|
||||
/* disable opening when insufficient space for opening a node available */
|
||||
if (set.ext_range_size() < max_open_size-1)
|
||||
set.set_ext_range(set.end()); /* disable opening */
|
||||
}
|
||||
|
|
|
@ -159,72 +159,6 @@ namespace embree
|
|||
assert(binID < BINS);
|
||||
bounds [binID][dim].extend(b);
|
||||
}
|
||||
|
||||
/*! bins an array of triangles */
|
||||
template<typename SplitPrimitive>
|
||||
__forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
|
||||
{
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
const PrimRef prim = prims[i];
|
||||
unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
|
||||
|
||||
if (unlikely(splits == 1))
|
||||
{
|
||||
const vint4 bin = mapping.bin(center(prim.bounds()));
|
||||
for (size_t dim=0; dim<3; dim++)
|
||||
{
|
||||
assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
|
||||
numBegin[bin[dim]][dim]++;
|
||||
numEnd [bin[dim]][dim]++;
|
||||
bounds [bin[dim]][dim].extend(prim.bounds());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const vint4 bin0 = mapping.bin(prim.bounds().lower);
|
||||
const vint4 bin1 = mapping.bin(prim.bounds().upper);
|
||||
|
||||
for (size_t dim=0; dim<3; dim++)
|
||||
{
|
||||
size_t bin;
|
||||
PrimRef rest = prim;
|
||||
size_t l = bin0[dim];
|
||||
size_t r = bin1[dim];
|
||||
|
||||
// same bin optimization
|
||||
if (likely(l == r))
|
||||
{
|
||||
numBegin[l][dim]++;
|
||||
numEnd [l][dim]++;
|
||||
bounds [l][dim].extend(prim.bounds());
|
||||
continue;
|
||||
}
|
||||
|
||||
for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++)
|
||||
{
|
||||
const float pos = mapping.pos(bin+1,dim);
|
||||
|
||||
PrimRef left,right;
|
||||
splitPrimitive(rest,(int)dim,pos,left,right);
|
||||
if (unlikely(left.bounds().empty())) l++;
|
||||
bounds[bin][dim].extend(left.bounds());
|
||||
rest = right;
|
||||
}
|
||||
if (unlikely(rest.bounds().empty())) r--;
|
||||
numBegin[l][dim]++;
|
||||
numEnd [r][dim]++;
|
||||
bounds [bin][dim].extend(rest.bounds());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*! bins a range of primitives inside an array */
|
||||
template<typename SplitPrimitive>
|
||||
void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
|
||||
bin(splitPrimitive,prims+begin,end-begin,mapping);
|
||||
}
|
||||
|
||||
/*! bins an array of primitives */
|
||||
template<typename PrimitiveSplitterFactory>
|
||||
|
@ -232,46 +166,65 @@ namespace embree
|
|||
{
|
||||
for (size_t i=begin; i<end; i++)
|
||||
{
|
||||
const PrimRef &prim = source[i];
|
||||
const vint4 bin0 = mapping.bin(prim.bounds().lower);
|
||||
const vint4 bin1 = mapping.bin(prim.bounds().upper);
|
||||
const PrimRef& prim = source[i];
|
||||
unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
|
||||
|
||||
for (size_t dim=0; dim<3; dim++)
|
||||
if (unlikely(splits <= 1))
|
||||
{
|
||||
if (unlikely(mapping.invalid(dim)))
|
||||
continue;
|
||||
|
||||
size_t bin;
|
||||
size_t l = bin0[dim];
|
||||
size_t r = bin1[dim];
|
||||
|
||||
// same bin optimization
|
||||
if (likely(l == r))
|
||||
const vint4 bin = mapping.bin(center(prim.bounds()));
|
||||
for (size_t dim=0; dim<3; dim++)
|
||||
{
|
||||
add(dim,l,l,l,prim.bounds());
|
||||
continue;
|
||||
assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
|
||||
add(dim,bin[dim],bin[dim],bin[dim],prim.bounds());
|
||||
}
|
||||
const size_t bin_start = bin0[dim];
|
||||
const size_t bin_end = bin1[dim];
|
||||
BBox3fa rest = prim.bounds();
|
||||
const auto splitter = splitterFactory(prim);
|
||||
for (bin=bin_start; bin<bin_end; bin++)
|
||||
{
|
||||
const float pos = mapping.pos(bin+1,dim);
|
||||
BBox3fa left,right;
|
||||
splitter(rest,dim,pos,left,right);
|
||||
if (unlikely(left.empty())) l++;
|
||||
extend(dim,bin,left);
|
||||
rest = right;
|
||||
}
|
||||
if (unlikely(rest.empty())) r--;
|
||||
add(dim,l,r,bin,rest);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const vint4 bin0 = mapping.bin(prim.bounds().lower);
|
||||
const vint4 bin1 = mapping.bin(prim.bounds().upper);
|
||||
|
||||
for (size_t dim=0; dim<3; dim++)
|
||||
{
|
||||
if (unlikely(mapping.invalid(dim)))
|
||||
continue;
|
||||
|
||||
size_t bin;
|
||||
size_t l = bin0[dim];
|
||||
size_t r = bin1[dim];
|
||||
|
||||
// same bin optimization
|
||||
if (likely(l == r))
|
||||
{
|
||||
add(dim,l,l,l,prim.bounds());
|
||||
continue;
|
||||
}
|
||||
size_t bin_start = bin0[dim];
|
||||
size_t bin_end = bin1[dim];
|
||||
BBox3fa rest = prim.bounds();
|
||||
|
||||
/* assure that split position always overlaps the primitive bounds */
|
||||
while (bin_start < bin_end && mapping.pos(bin_start+1,dim) <= rest.lower[dim]) bin_start++;
|
||||
while (bin_start < bin_end && mapping.pos(bin_end ,dim) >= rest.upper[dim]) bin_end--;
|
||||
|
||||
const auto splitter = splitterFactory(prim);
|
||||
for (bin=bin_start; bin<bin_end; bin++)
|
||||
{
|
||||
const float pos = mapping.pos(bin+1,dim);
|
||||
BBox3fa left,right;
|
||||
splitter(rest,dim,pos,left,right);
|
||||
|
||||
if (unlikely(left.empty())) l++;
|
||||
extend(dim,bin,left);
|
||||
rest = right;
|
||||
}
|
||||
if (unlikely(rest.empty())) r--;
|
||||
add(dim,l,r,bin,rest);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*! bins an array of primitives */
|
||||
__forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
|
||||
{
|
||||
|
|
|
@ -241,7 +241,7 @@ namespace embree
|
|||
SpatialBinner binner(empty);
|
||||
const SpatialBinMapping<SPATIAL_BINS> mapping(set);
|
||||
binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
|
||||
/* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
|
||||
/* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
|
||||
return binner.best(mapping,logBlockSize); //,set.ext_size());
|
||||
}
|
||||
|
||||
|
@ -256,7 +256,7 @@ namespace embree
|
|||
binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
|
||||
return binner; },
|
||||
[&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
|
||||
/* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
|
||||
/* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
|
||||
return binner.best(mapping,logBlockSize); //,set.ext_size());
|
||||
}
|
||||
|
||||
|
@ -286,6 +286,7 @@ namespace embree
|
|||
//int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
|
||||
//int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
|
||||
//if (unlikely(bin0 < split.pos && bin1 >= split.pos))
|
||||
|
||||
if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
|
||||
{
|
||||
assert(splits > 1);
|
||||
|
@ -384,8 +385,8 @@ namespace embree
|
|||
new (&lset) PrimInfoExtRange(begin,center,center,local_left);
|
||||
new (&rset) PrimInfoExtRange(center,end,end,local_right);
|
||||
|
||||
assert(area(lset.geomBounds) >= 0.0f);
|
||||
assert(area(rset.geomBounds) >= 0.0f);
|
||||
assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
|
||||
assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
|
||||
return std::pair<size_t,size_t>(left_weight,right_weight);
|
||||
}
|
||||
|
||||
|
@ -410,7 +411,7 @@ namespace embree
|
|||
begin,end,local_left,local_right,
|
||||
[&] (const PrimRef& ref) {
|
||||
const Vec3fa c = ref.bounds().center();
|
||||
return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
|
||||
return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
|
||||
},
|
||||
[] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
|
||||
|
||||
|
@ -419,8 +420,8 @@ namespace embree
|
|||
|
||||
new (&lset) PrimInfoExtRange(begin,center,center,local_left);
|
||||
new (&rset) PrimInfoExtRange(center,end,end,local_right);
|
||||
assert(area(lset.geomBounds) >= 0.0f);
|
||||
assert(area(rset.geomBounds) >= 0.0f);
|
||||
assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
|
||||
assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
|
||||
return std::pair<size_t,size_t>(left_weight,right_weight);
|
||||
}
|
||||
|
||||
|
|
|
@ -184,9 +184,7 @@ namespace embree
|
|||
|
||||
// special variants for grid meshes
|
||||
|
||||
// -- GODOT start --
|
||||
#if defined(EMBREE_GEOMETRY_GRID)
|
||||
// -- GODOT end --
|
||||
PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
|
||||
{
|
||||
PrimInfo pinfo(empty);
|
||||
|
@ -296,9 +294,7 @@ namespace embree
|
|||
|
||||
return pinfo;
|
||||
}
|
||||
// -- GODOT start --
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
// ====================================================================================================
|
||||
// ====================================================================================================
|
||||
|
|
|
@ -266,7 +266,7 @@ namespace embree
|
|||
/* anything to split ? */
|
||||
if (center < numPrimitives)
|
||||
{
|
||||
const size_t numPrimitivesToSplit = numPrimitives - center;
|
||||
size_t numPrimitivesToSplit = numPrimitives - center;
|
||||
assert(presplitItem[center].priority >= 1.0f);
|
||||
|
||||
/* sort presplit items in ascending order */
|
||||
|
@ -279,8 +279,8 @@ namespace embree
|
|||
});
|
||||
);
|
||||
|
||||
unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
|
||||
unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
|
||||
unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem;
|
||||
unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
|
||||
|
||||
/* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
|
||||
const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
|
||||
|
@ -317,11 +317,16 @@ namespace embree
|
|||
sum += numSubPrims;
|
||||
}
|
||||
new_center++;
|
||||
|
||||
primOffset0 += new_center - center;
|
||||
numPrimitivesToSplit -= new_center - center;
|
||||
center = new_center;
|
||||
assert(numPrimitivesToSplit == (numPrimitives - center));
|
||||
}
|
||||
|
||||
/* parallel prefix sum to compute offsets for storing sub-primitives */
|
||||
const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
|
||||
assert(numPrimitives+offset <= alloc_numPrimitives);
|
||||
|
||||
/* iterate over range, and split primitives into sub primitives and append them to prims array */
|
||||
parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
|
||||
|
@ -338,7 +343,7 @@ namespace embree
|
|||
unsigned int numSubPrims = 0;
|
||||
splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
|
||||
const size_t newID = numPrimitives + primOffset1[j-center];
|
||||
assert(newID+numSubPrims <= alloc_numPrimitives);
|
||||
assert(newID+numSubPrims-1 <= alloc_numPrimitives);
|
||||
prims[primrefID] = subPrims[0];
|
||||
for (size_t i=1;i<numSubPrims;i++)
|
||||
prims[newID+i-1] = subPrims[i];
|
||||
|
|
|
@ -128,28 +128,30 @@ namespace embree
|
|||
const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
|
||||
const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );
|
||||
QuadMesh::Quad quad = mesh->quad(prim.primID());
|
||||
v[0] = mesh->vertex(quad.v[0]);
|
||||
v[1] = mesh->vertex(quad.v[1]);
|
||||
v[2] = mesh->vertex(quad.v[2]);
|
||||
v[3] = mesh->vertex(quad.v[3]);
|
||||
v[4] = mesh->vertex(quad.v[0]);
|
||||
inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
|
||||
inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
|
||||
inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
|
||||
inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
|
||||
v[0] = mesh->vertex(quad.v[1]);
|
||||
v[1] = mesh->vertex(quad.v[2]);
|
||||
v[2] = mesh->vertex(quad.v[3]);
|
||||
v[3] = mesh->vertex(quad.v[0]);
|
||||
v[4] = mesh->vertex(quad.v[1]);
|
||||
v[5] = mesh->vertex(quad.v[3]);
|
||||
inv_length[0] = Vec3fa(1.0f) / (v[1] - v[0]);
|
||||
inv_length[1] = Vec3fa(1.0f) / (v[2] - v[1]);
|
||||
inv_length[2] = Vec3fa(1.0f) / (v[3] - v[2]);
|
||||
inv_length[3] = Vec3fa(1.0f) / (v[4] - v[3]);
|
||||
inv_length[4] = Vec3fa(1.0f) / (v[5] - v[4]);
|
||||
}
|
||||
|
||||
__forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
|
||||
splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
|
||||
splitPolygon<5>(prim,dim,pos,v,left_o,right_o);
|
||||
}
|
||||
|
||||
__forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
|
||||
splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
|
||||
splitPolygon<5>(prim,dim,pos,v,inv_length,left_o,right_o);
|
||||
}
|
||||
|
||||
private:
|
||||
Vec3fa v[5];
|
||||
Vec3fa inv_length[4];
|
||||
Vec3fa v[6];
|
||||
Vec3fa inv_length[5];
|
||||
};
|
||||
|
||||
struct QuadSplitterFactory
|
||||
|
|
|
@ -183,7 +183,7 @@ namespace embree
|
|||
template class BVHN<8>;
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
|
||||
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
|
||||
template class BVHN<4>;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -230,7 +230,7 @@ namespace embree
|
|||
continue;
|
||||
|
||||
/* switch to single ray traversal */
|
||||
#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
|
||||
#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
|
||||
#if FORCE_SINGLE_MODE == 0
|
||||
if (single)
|
||||
#endif
|
||||
|
@ -676,7 +676,7 @@ namespace embree
|
|||
continue;
|
||||
|
||||
/* switch to single ray traversal */
|
||||
#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
|
||||
#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
|
||||
#if FORCE_SINGLE_MODE == 0
|
||||
if (single)
|
||||
#endif
|
||||
|
|
|
@ -170,12 +170,23 @@ namespace embree
|
|||
TravRayKStream<K,robust> &p = packets[rayID / K];
|
||||
const size_t i = rayID % K;
|
||||
const vint<N> bitmask(shiftTable[rayID]);
|
||||
|
||||
#if defined (__aarch64__)
|
||||
const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
|
||||
const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
|
||||
const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
|
||||
const vfloat<N> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
|
||||
const vfloat<N> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
|
||||
const vfloat<N> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
|
||||
#else
|
||||
const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
|
||||
const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
|
||||
const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
|
||||
const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
|
||||
const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
|
||||
const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
|
||||
#endif
|
||||
|
||||
const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
|
||||
const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i]));
|
||||
|
||||
|
|
|
@ -46,6 +46,14 @@ namespace embree
|
|||
template<typename BuildRecord>
|
||||
__forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
// check that empty children are only at the end of the child list
|
||||
bool emptyChild = false;
|
||||
for (size_t i=0; i<num; i++) {
|
||||
emptyChild |= (children[i] == NodeRef::emptyNode);
|
||||
assert(emptyChild == (children[i] == NodeRef::emptyNode));
|
||||
}
|
||||
#endif
|
||||
AABBNode_t* node = ref.getAABBNode();
|
||||
for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
|
||||
return ref;
|
||||
|
@ -60,6 +68,14 @@ namespace embree
|
|||
template<typename BuildRecord>
|
||||
__forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
// check that empty children are only at the end of the child list
|
||||
bool emptyChild = false;
|
||||
for (size_t i=0; i<num; i++) {
|
||||
emptyChild |= (children[i] == NodeRef::emptyNode);
|
||||
assert(emptyChild == (children[i] == NodeRef::emptyNode));
|
||||
}
|
||||
#endif
|
||||
AABBNode_t* node = ref.getAABBNode();
|
||||
for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
|
||||
|
||||
|
|
|
@ -31,6 +31,14 @@ namespace embree
|
|||
template<typename BuildRecord>
|
||||
__forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
// check that empty children are only at the end of the child list
|
||||
bool emptyChild = false;
|
||||
for (size_t i=0; i<num; i++) {
|
||||
emptyChild |= (children[i].ref == NodeRef::emptyNode);
|
||||
assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
|
||||
}
|
||||
#endif
|
||||
AABBNodeMB_t* node = ref.getAABBNodeMB();
|
||||
|
||||
LBBox3fa bounds = empty;
|
||||
|
|
|
@ -41,6 +41,14 @@ namespace embree
|
|||
template<typename BuildRecord>
|
||||
__forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
// check that empty children are only at the end of the child list
|
||||
bool emptyChild = false;
|
||||
for (size_t i=0; i<num; i++) {
|
||||
emptyChild |= (children[i].ref == NodeRef::emptyNode);
|
||||
assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
|
||||
}
|
||||
#endif
|
||||
if (likely(ref.isAABBNodeMB())) {
|
||||
for (size_t i=0; i<num; i++)
|
||||
ref.getAABBNodeMB()->set(i, children[i]);
|
||||
|
|
|
@ -190,6 +190,14 @@ namespace embree
|
|||
template<typename BuildRecord>
|
||||
__forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
|
||||
{
|
||||
#if defined(DEBUG)
|
||||
// check that empty children are only at the end of the child list
|
||||
bool emptyChild = false;
|
||||
for (size_t i=0; i<num; i++) {
|
||||
emptyChild |= (children[i] == NodeRef::emptyNode);
|
||||
assert(emptyChild == (children[i] == NodeRef::emptyNode));
|
||||
}
|
||||
#endif
|
||||
QuantizedNode_t* node = ref.quantizedNode();
|
||||
for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
|
||||
return ref;
|
||||
|
|
|
@ -162,7 +162,7 @@ namespace embree
|
|||
template class BVHNStatistics<8>;
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
|
||||
#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
|
||||
template class BVHNStatistics<4>;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -5,6 +5,15 @@
|
|||
|
||||
#include "node_intersector.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define __FMA_X4__
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#define __FMA_X4__
|
||||
#endif
|
||||
|
||||
|
||||
namespace embree
|
||||
{
|
||||
namespace isa
|
||||
|
@ -29,9 +38,15 @@ namespace embree
|
|||
org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
|
||||
dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
|
||||
rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
const Vec3fa ray_org_rdir = ray_org*ray_rdir;
|
||||
#if !defined(__aarch64__)
|
||||
org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
|
||||
#else
|
||||
//for aarch64, there is no instruction equivalent to msub, so we negate org_rdir and use madd
|
||||
//x86 will use msub
|
||||
neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
|
||||
#endif
|
||||
#endif
|
||||
nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
|
||||
nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
|
||||
|
@ -49,8 +64,12 @@ namespace embree
|
|||
org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
|
||||
dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
|
||||
rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
org_rdir = org*rdir;
|
||||
#if defined(__FMA_X4__)
|
||||
#if !defined(__aarch64__)
|
||||
org_rdir = org*rdir;
|
||||
#else
|
||||
neg_org_rdir = -(org*rdir);
|
||||
#endif
|
||||
#endif
|
||||
nearX = nearXYZ.x[k];
|
||||
nearY = nearXYZ.y[k];
|
||||
|
@ -62,8 +81,14 @@ namespace embree
|
|||
|
||||
Vec3fa org_xyz, dir_xyz;
|
||||
Vec3vf<N> org, dir, rdir;
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
#if !defined(__aarch64__)
|
||||
Vec3vf<N> org_rdir;
|
||||
#else
|
||||
//the aarch64 version keeps the negation of org_rdir and uses madd
|
||||
//x86 uses msub
|
||||
Vec3vf<N> neg_org_rdir;
|
||||
#endif
|
||||
#endif
|
||||
size_t nearX, nearY, nearZ;
|
||||
size_t farX, farY, farZ;
|
||||
|
@ -404,13 +429,22 @@ namespace embree
|
|||
template<>
|
||||
__forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist)
|
||||
{
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
|
||||
const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
|
||||
|
@ -450,13 +484,23 @@ namespace embree
|
|||
template<>
|
||||
__forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist)
|
||||
{
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
|
||||
#else
|
||||
const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
|
||||
const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
|
||||
|
@ -522,13 +566,22 @@ namespace embree
|
|||
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
|
||||
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
|
||||
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
|
||||
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
|
||||
|
@ -537,7 +590,7 @@ namespace embree
|
|||
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
|
||||
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
|
||||
#endif
|
||||
#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
|
||||
#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
|
||||
const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
|
||||
const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
|
||||
const vbool<N> vmask = asInt(tNear) > asInt(tFar);
|
||||
|
@ -598,13 +651,22 @@ namespace embree
|
|||
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
|
||||
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
|
||||
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
|
||||
#if defined (__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined (__FMA_X4__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
|
||||
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
|
||||
|
@ -613,7 +675,7 @@ namespace embree
|
|||
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
|
||||
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
|
||||
#endif
|
||||
#if defined(__AVX2__) && !defined(__AVX512F__)
|
||||
#if defined(__FMA_X4__) && !defined(__AVX512F__)
|
||||
const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
|
||||
const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
|
||||
#else
|
||||
|
@ -687,13 +749,22 @@ namespace embree
|
|||
const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
|
||||
const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z);
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
|
||||
const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
|
||||
|
@ -703,7 +774,7 @@ namespace embree
|
|||
const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
|
||||
#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
|
||||
const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
|
||||
const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
|
||||
const vbool4 vmask = asInt(tNear) > asInt(tFar);
|
||||
|
@ -775,13 +846,22 @@ namespace embree
|
|||
const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
|
||||
const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z);
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
|
||||
const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
|
||||
|
@ -857,13 +937,22 @@ namespace embree
|
|||
const vfloat<N> upper_y = node->dequantizeUpperY(time);
|
||||
const vfloat<N> lower_z = node->dequantizeLowerZ(time);
|
||||
const vfloat<N> upper_z = node->dequantizeUpperZ(time);
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__FMA_X4__)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
#else
|
||||
const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
|
||||
const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
|
||||
|
|
|
@ -75,9 +75,13 @@ namespace embree
|
|||
min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
|
||||
max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
|
||||
|
||||
#if defined (__aarch64__)
|
||||
neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
|
||||
neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
|
||||
#else
|
||||
min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
|
||||
max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
|
||||
|
||||
#endif
|
||||
min_dist = reduced_min_dist;
|
||||
max_dist = reduced_max_dist;
|
||||
|
||||
|
@ -95,9 +99,13 @@ namespace embree
|
|||
Vec3fa min_rdir;
|
||||
Vec3fa max_rdir;
|
||||
|
||||
#if defined (__aarch64__)
|
||||
Vec3fa neg_min_org_rdir;
|
||||
Vec3fa neg_max_org_rdir;
|
||||
#else
|
||||
Vec3fa min_org_rdir;
|
||||
Vec3fa max_org_rdir;
|
||||
|
||||
#endif
|
||||
float min_dist;
|
||||
float max_dist;
|
||||
};
|
||||
|
@ -191,13 +199,21 @@ namespace embree
|
|||
const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
|
||||
const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
|
||||
|
||||
#if defined (__aarch64__)
|
||||
const vfloat<N> fminX = madd(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.neg_min_org_rdir.x));
|
||||
const vfloat<N> fminY = madd(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.neg_min_org_rdir.y));
|
||||
const vfloat<N> fminZ = madd(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.neg_min_org_rdir.z));
|
||||
const vfloat<N> fmaxX = madd(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.neg_max_org_rdir.x));
|
||||
const vfloat<N> fmaxY = madd(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.neg_max_org_rdir.y));
|
||||
const vfloat<N> fmaxZ = madd(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.neg_max_org_rdir.z));
|
||||
#else
|
||||
const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x));
|
||||
const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y));
|
||||
const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z));
|
||||
const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x));
|
||||
const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y));
|
||||
const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z));
|
||||
|
||||
#endif
|
||||
const vfloat<N> fmin = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist));
|
||||
dist = fmin;
|
||||
const vfloat<N> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist));
|
||||
|
|
|
@ -39,7 +39,9 @@ namespace embree
|
|||
org = ray_org;
|
||||
dir = ray_dir;
|
||||
rdir = rcp_safe(ray_dir);
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
neg_org_rdir = -(org * rdir);
|
||||
#elif defined(__AVX2__)
|
||||
org_rdir = org * rdir;
|
||||
#endif
|
||||
|
||||
|
@ -55,7 +57,9 @@ namespace embree
|
|||
Vec3vf<K> org;
|
||||
Vec3vf<K> dir;
|
||||
Vec3vf<K> rdir;
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
Vec3vf<K> neg_org_rdir;
|
||||
#elif defined(__AVX2__)
|
||||
Vec3vf<K> org_rdir;
|
||||
#endif
|
||||
Vec3vi<K> nearXYZ;
|
||||
|
@ -119,7 +123,14 @@ namespace embree
|
|||
const TravRayKFast<K>& ray, vfloat<K>& dist)
|
||||
|
||||
{
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#elif defined(__AVX2__)
|
||||
const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
|
||||
|
@ -199,7 +210,14 @@ namespace embree
|
|||
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
|
||||
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#elif defined(__AVX2__)
|
||||
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
|
@ -302,7 +320,14 @@ namespace embree
|
|||
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
|
||||
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#elif defined(__AVX2__)
|
||||
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
|
@ -464,7 +489,14 @@ namespace embree
|
|||
const vfloat<N> lower_z = node->dequantizeLowerZ();
|
||||
const vfloat<N> upper_z = node->dequantizeUpperZ();
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#elif defined(__AVX2__)
|
||||
const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
|
||||
|
@ -549,7 +581,14 @@ namespace embree
|
|||
const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time);
|
||||
const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time);
|
||||
|
||||
#if defined(__AVX2__) || defined(__ARM_NEON)
|
||||
#if defined(__aarch64__)
|
||||
const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#elif defined(__AVX2__)
|
||||
const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
|
||||
|
|
|
@ -32,11 +32,19 @@ namespace embree
|
|||
__forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
|
||||
{
|
||||
rdir = rcp_safe(ray_dir);
|
||||
#if defined(__aarch64__)
|
||||
neg_org_rdir = -(ray_org * rdir);
|
||||
#else
|
||||
org_rdir = ray_org * rdir;
|
||||
#endif
|
||||
}
|
||||
|
||||
Vec3vf<K> rdir;
|
||||
#if defined(__aarch64__)
|
||||
Vec3vf<K> neg_org_rdir;
|
||||
#else
|
||||
Vec3vf<K> org_rdir;
|
||||
#endif
|
||||
vfloat<K> tnear;
|
||||
vfloat<K> tfar;
|
||||
};
|
||||
|
@ -87,12 +95,21 @@ namespace embree
|
|||
const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
|
||||
const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
|
||||
|
||||
#if defined (__aarch64__)
|
||||
const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
|
||||
const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
|
||||
const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
|
||||
const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
|
||||
const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
|
||||
const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
|
||||
#else
|
||||
const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
|
||||
const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
|
||||
const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
|
||||
const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
|
||||
const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
|
||||
const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
|
||||
#endif
|
||||
const vfloat<N> rmin = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
|
||||
const vfloat<N> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
|
||||
|
||||
|
@ -113,12 +130,21 @@ namespace embree
|
|||
const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
|
||||
const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
|
||||
|
||||
#if defined (__aarch64__)
|
||||
const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
|
||||
const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
|
||||
const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
|
||||
#else
|
||||
const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
|
||||
const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
|
||||
const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
|
||||
const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
|
||||
#endif
|
||||
|
||||
const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear);
|
||||
const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
|
||||
|
|
|
@ -332,7 +332,7 @@ namespace embree
|
|||
intersectorN.intersect(this,rayN,N,context);
|
||||
}
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
__forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
|
||||
const vint<4> mask = valid.mask32();
|
||||
intersect4(&mask,(RTCRayHit4&)ray,context);
|
||||
|
@ -388,7 +388,7 @@ namespace embree
|
|||
intersectorN.occluded(this,rayN,N,context);
|
||||
}
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
__forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
|
||||
const vint<4> mask = valid.mask32();
|
||||
occluded4(&mask,(RTCRay4&)ray,context);
|
||||
|
|
|
@ -97,7 +97,7 @@ namespace embree
|
|||
for (size_t i=0; i<This->accels.size(); i++) {
|
||||
if (This->accels[i]->isEmpty()) continue;
|
||||
This->accels[i]->intersectors.occluded4(valid,ray,context);
|
||||
#if defined(__SSE2__)
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
vbool4 valid0 = asBool(((vint4*)valid)[0]);
|
||||
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
|
||||
if (unlikely(none(valid0 & hit0))) break;
|
||||
|
@ -111,7 +111,7 @@ namespace embree
|
|||
for (size_t i=0; i<This->accels.size(); i++) {
|
||||
if (This->accels[i]->isEmpty()) continue;
|
||||
This->accels[i]->intersectors.occluded8(valid,ray,context);
|
||||
#if defined(__SSE2__) // FIXME: use higher ISA
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
|
||||
vbool4 valid0 = asBool(((vint4*)valid)[0]);
|
||||
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
|
||||
vbool4 valid1 = asBool(((vint4*)valid)[1]);
|
||||
|
@ -127,7 +127,7 @@ namespace embree
|
|||
for (size_t i=0; i<This->accels.size(); i++) {
|
||||
if (This->accels[i]->isEmpty()) continue;
|
||||
This->accels[i]->intersectors.occluded16(valid,ray,context);
|
||||
#if defined(__SSE2__) // FIXME: use higher ISA
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
|
||||
vbool4 valid0 = asBool(((vint4*)valid)[0]);
|
||||
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
|
||||
vbool4 valid1 = asBool(((vint4*)valid)[1]);
|
||||
|
|
|
@ -14,21 +14,14 @@ namespace embree
|
|||
struct IntersectFunctionNArguments;
|
||||
struct OccludedFunctionNArguments;
|
||||
|
||||
typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
|
||||
typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
|
||||
|
||||
struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
|
||||
{
|
||||
IntersectContext* internal_context;
|
||||
Geometry* geometry;
|
||||
ReportIntersectionFunc report;
|
||||
};
|
||||
|
||||
struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
|
||||
{
|
||||
IntersectContext* internal_context;
|
||||
Geometry* geometry;
|
||||
ReportOcclusionFunc report;
|
||||
};
|
||||
|
||||
/*! Base class for set of acceleration structures. */
|
||||
|
@ -145,7 +138,7 @@ namespace embree
|
|||
public:
|
||||
|
||||
/*! Intersects a single ray with the scene. */
|
||||
__forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
|
||||
__forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
|
||||
{
|
||||
assert(primID < size());
|
||||
assert(intersectorN.intersect);
|
||||
|
@ -159,15 +152,13 @@ namespace embree
|
|||
args.N = 1;
|
||||
args.geomID = geomID;
|
||||
args.primID = primID;
|
||||
args.internal_context = context;
|
||||
args.geometry = this;
|
||||
args.report = report;
|
||||
|
||||
intersectorN.intersect(&args);
|
||||
}
|
||||
|
||||
/*! Tests if single ray is occluded by the scene. */
|
||||
__forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
|
||||
__forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
|
||||
{
|
||||
assert(primID < size());
|
||||
assert(intersectorN.occluded);
|
||||
|
@ -181,16 +172,14 @@ namespace embree
|
|||
args.N = 1;
|
||||
args.geomID = geomID;
|
||||
args.primID = primID;
|
||||
args.internal_context = context;
|
||||
args.geometry = this;
|
||||
args.report = report;
|
||||
|
||||
intersectorN.occluded(&args);
|
||||
}
|
||||
|
||||
/*! Intersects a packet of K rays with the scene. */
|
||||
template<int K>
|
||||
__forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
|
||||
__forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
|
||||
{
|
||||
assert(primID < size());
|
||||
assert(intersectorN.intersect);
|
||||
|
@ -204,16 +193,14 @@ namespace embree
|
|||
args.N = K;
|
||||
args.geomID = geomID;
|
||||
args.primID = primID;
|
||||
args.internal_context = context;
|
||||
args.geometry = this;
|
||||
args.report = report;
|
||||
|
||||
intersectorN.intersect(&args);
|
||||
}
|
||||
|
||||
/*! Tests if a packet of K rays is occluded by the scene. */
|
||||
template<int K>
|
||||
__forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
|
||||
__forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
|
||||
{
|
||||
assert(primID < size());
|
||||
assert(intersectorN.occluded);
|
||||
|
@ -227,9 +214,7 @@ namespace embree
|
|||
args.N = K;
|
||||
args.geomID = geomID;
|
||||
args.primID = primID;
|
||||
args.internal_context = context;
|
||||
args.geometry = this;
|
||||
args.report = report;
|
||||
|
||||
intersectorN.occluded(&args);
|
||||
}
|
||||
|
|
|
@ -3,6 +3,9 @@
|
|||
|
||||
#include "alloc.h"
|
||||
#include "../../common/sys/thread.h"
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
#include "../../common/sys/barrier.h"
|
||||
#endif
|
||||
|
||||
namespace embree
|
||||
{
|
||||
|
|
|
@ -8,6 +8,10 @@
|
|||
#include "scene.h"
|
||||
#include "primref.h"
|
||||
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
#include <mutex>
|
||||
#endif
|
||||
|
||||
namespace embree
|
||||
{
|
||||
class FastAllocator
|
||||
|
@ -26,7 +30,7 @@ namespace embree
|
|||
public:
|
||||
|
||||
struct ThreadLocal2;
|
||||
enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
|
||||
enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
|
||||
|
||||
/*! Per thread structure holding the current memory block. */
|
||||
struct __aligned(64) ThreadLocal
|
||||
|
@ -132,7 +136,11 @@ namespace embree
|
|||
{
|
||||
assert(alloc_i);
|
||||
if (alloc.load() == alloc_i) return;
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(mutex);
|
||||
#else
|
||||
Lock<SpinLock> lock(mutex);
|
||||
#endif
|
||||
//if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
|
||||
if (alloc.load()) {
|
||||
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
|
||||
|
@ -150,7 +158,11 @@ namespace embree
|
|||
{
|
||||
assert(alloc_i);
|
||||
if (alloc.load() != alloc_i) return;
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(mutex);
|
||||
#else
|
||||
Lock<SpinLock> lock(mutex);
|
||||
#endif
|
||||
if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
|
||||
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
|
||||
alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
|
||||
|
@ -161,7 +173,11 @@ namespace embree
|
|||
}
|
||||
|
||||
public:
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::mutex mutex;
|
||||
#else
|
||||
SpinLock mutex; //!< required as unbind is called from other threads
|
||||
#endif
|
||||
std::atomic<FastAllocator*> alloc; //!< parent allocator
|
||||
ThreadLocal alloc0;
|
||||
ThreadLocal alloc1;
|
||||
|
@ -169,7 +185,7 @@ namespace embree
|
|||
|
||||
FastAllocator (Device* device, bool osAllocation)
|
||||
: device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
|
||||
growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
|
||||
growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
|
||||
primrefarray(device,0)
|
||||
{
|
||||
for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
|
||||
|
@ -206,7 +222,7 @@ namespace embree
|
|||
|
||||
void setOSallocation(bool flag)
|
||||
{
|
||||
atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
|
||||
atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -217,7 +233,11 @@ namespace embree
|
|||
ThreadLocal2* alloc = thread_local_allocator2;
|
||||
if (alloc == nullptr) {
|
||||
thread_local_allocator2 = alloc = new ThreadLocal2;
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(s_thread_local_allocators_lock);
|
||||
#else
|
||||
Lock<SpinLock> lock(s_thread_local_allocators_lock);
|
||||
#endif
|
||||
s_thread_local_allocators.push_back(make_unique(alloc));
|
||||
}
|
||||
return alloc;
|
||||
|
@ -227,7 +247,11 @@ namespace embree
|
|||
|
||||
__forceinline void join(ThreadLocal2* alloc)
|
||||
{
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(s_thread_local_allocators_lock);
|
||||
#else
|
||||
Lock<SpinLock> lock(thread_local_allocators_lock);
|
||||
#endif
|
||||
thread_local_allocators.push_back(alloc);
|
||||
}
|
||||
|
||||
|
@ -492,7 +516,11 @@ namespace embree
|
|||
/* parallel block creation in case of no freeBlocks, avoids single global mutex */
|
||||
if (likely(freeBlocks.load() == nullptr))
|
||||
{
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(slotMutex[slot]);
|
||||
#else
|
||||
Lock<SpinLock> lock(slotMutex[slot]);
|
||||
#endif
|
||||
if (myUsedBlocks == threadUsedBlocks[slot]) {
|
||||
const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
|
||||
const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
|
||||
|
@ -505,7 +533,11 @@ namespace embree
|
|||
|
||||
/* if this fails allocate new block */
|
||||
{
|
||||
Lock<SpinLock> lock(mutex);
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(mutex);
|
||||
#else
|
||||
Lock<SpinLock> lock(mutex);
|
||||
#endif
|
||||
if (myUsedBlocks == threadUsedBlocks[slot])
|
||||
{
|
||||
if (freeBlocks.load() != nullptr) {
|
||||
|
@ -527,7 +559,11 @@ namespace embree
|
|||
/*! add new block */
|
||||
void addBlock(void* ptr, ssize_t bytes)
|
||||
{
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::scoped_lock lock(mutex);
|
||||
#else
|
||||
Lock<SpinLock> lock(mutex);
|
||||
#endif
|
||||
const size_t sizeof_Header = offsetof(Block,data[0]);
|
||||
void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
|
||||
size_t ofs = (size_t) aptr - (size_t) ptr;
|
||||
|
@ -613,8 +649,8 @@ namespace embree
|
|||
bytesWasted(alloc->bytesWasted),
|
||||
stat_all(alloc,ANY_TYPE),
|
||||
stat_malloc(alloc,ALIGNED_MALLOC),
|
||||
stat_4K(alloc,OS_MALLOC,false),
|
||||
stat_2M(alloc,OS_MALLOC,true),
|
||||
stat_4K(alloc,EMBREE_OS_MALLOC,false),
|
||||
stat_2M(alloc,EMBREE_OS_MALLOC,true),
|
||||
stat_shared(alloc,SHARED) {}
|
||||
|
||||
AllStatistics (size_t bytesUsed,
|
||||
|
@ -707,7 +743,7 @@ namespace embree
|
|||
/* We avoid using os_malloc for small blocks as this could
|
||||
* cause a risk of fragmenting the virtual address space and
|
||||
* reach the limit of vm.max_map_count = 65k under Linux. */
|
||||
if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
|
||||
if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
|
||||
atype = ALIGNED_MALLOC;
|
||||
|
||||
/* we need to additionally allocate some header */
|
||||
|
@ -716,7 +752,7 @@ namespace embree
|
|||
bytesReserve = sizeof_Header+bytesReserve;
|
||||
|
||||
/* consume full 4k pages with using os_malloc */
|
||||
if (atype == OS_MALLOC) {
|
||||
if (atype == EMBREE_OS_MALLOC) {
|
||||
bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
|
||||
bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
|
||||
}
|
||||
|
@ -748,11 +784,11 @@ namespace embree
|
|||
return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
|
||||
}
|
||||
}
|
||||
else if (atype == OS_MALLOC)
|
||||
else if (atype == EMBREE_OS_MALLOC)
|
||||
{
|
||||
if (device) device->memoryMonitor(bytesAllocate,false);
|
||||
bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
|
||||
return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
|
||||
return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
|
||||
}
|
||||
else
|
||||
assert(false);
|
||||
|
@ -796,7 +832,7 @@ namespace embree
|
|||
if (device) device->memoryMonitor(-sizeof_Alloced,true);
|
||||
}
|
||||
|
||||
else if (atype == OS_MALLOC) {
|
||||
else if (atype == EMBREE_OS_MALLOC) {
|
||||
size_t sizeof_This = sizeof_Header+reserveEnd;
|
||||
os_free(this,sizeof_This,huge_pages);
|
||||
if (device) device->memoryMonitor(-sizeof_Alloced,true);
|
||||
|
@ -857,7 +893,7 @@ namespace embree
|
|||
bool hasType(AllocationType atype_i, bool huge_pages_i) const
|
||||
{
|
||||
if (atype_i == ANY_TYPE ) return true;
|
||||
else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
|
||||
else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
|
||||
else return atype_i == atype;
|
||||
}
|
||||
|
||||
|
@ -906,7 +942,7 @@ namespace embree
|
|||
void print_block() const
|
||||
{
|
||||
if (atype == ALIGNED_MALLOC) std::cout << "A";
|
||||
else if (atype == OS_MALLOC) std::cout << "O";
|
||||
else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
|
||||
else if (atype == SHARED) std::cout << "S";
|
||||
if (huge_pages) std::cout << "H";
|
||||
size_t bytesUsed = getBlockUsedBytes();
|
||||
|
@ -936,7 +972,11 @@ namespace embree
|
|||
std::atomic<Block*> freeBlocks;
|
||||
|
||||
std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
|
||||
SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
|
||||
#else
|
||||
PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
|
||||
#endif
|
||||
|
||||
bool use_single_mode;
|
||||
size_t defaultBlockSize;
|
||||
|
@ -950,7 +990,11 @@ namespace embree
|
|||
static __thread ThreadLocal2* thread_local_allocator2;
|
||||
static SpinLock s_thread_local_allocators_lock;
|
||||
static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
|
||||
#if defined(APPLE) && defined(__aarch64__)
|
||||
std::mutex thread_local_allocators_lock;
|
||||
#else
|
||||
SpinLock thread_local_allocators_lock;
|
||||
#endif
|
||||
std::vector<ThreadLocal2*> thread_local_allocators;
|
||||
AllocationType atype;
|
||||
mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
|
||||
|
|
|
@ -66,7 +66,11 @@ namespace embree
|
|||
case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break;
|
||||
case CPU::XEON_PHI_KNIGHTS_MILL : frequency_level = FREQUENCY_SIMD512; break;
|
||||
case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break;
|
||||
#if defined(__APPLE__)
|
||||
case CPU::ARM: frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4
|
||||
#else
|
||||
case CPU::ARM: frequency_level = FREQUENCY_SIMD128; break;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* initialize global state */
|
||||
|
|
|
@ -91,7 +91,7 @@ namespace embree
|
|||
|
||||
size_t numFilterFunctions; //!< number of geometries with filter functions enabled
|
||||
size_t numTriangles; //!< number of enabled triangles
|
||||
size_t numMBTriangles; //!< number of enabled motion blured triangles
|
||||
size_t numMBTriangles; //!< number of enabled motion blurred triangles
|
||||
size_t numQuads; //!< number of enabled quads
|
||||
size_t numMBQuads; //!< number of enabled motion blurred quads
|
||||
size_t numBezierCurves; //!< number of enabled curves
|
||||
|
@ -99,7 +99,7 @@ namespace embree
|
|||
size_t numLineSegments; //!< number of enabled line segments
|
||||
size_t numMBLineSegments; //!< number of enabled line motion blurred segments
|
||||
size_t numSubdivPatches; //!< number of enabled subdivision patches
|
||||
size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches
|
||||
size_t numMBSubdivPatches; //!< number of enabled motion blurred subdivision patches
|
||||
size_t numUserGeometries; //!< number of enabled user geometries
|
||||
size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries
|
||||
size_t numInstancesCheap; //!< number of enabled cheap instances
|
||||
|
|
|
@ -44,7 +44,7 @@ namespace embree
|
|||
#define SELECT_SYMBOL_DEFAULT(features,intersector) \
|
||||
intersector = isa::intersector;
|
||||
|
||||
#if defined(__SSE__)
|
||||
#if defined(__SSE__) || defined(__ARM_NEON)
|
||||
#if !defined(EMBREE_TARGET_SIMD4)
|
||||
#define EMBREE_TARGET_SIMD4
|
||||
#endif
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include "default.h"
|
||||
#include "instance_stack.h"
|
||||
|
||||
// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted
|
||||
// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
|
||||
|
||||
namespace embree
|
||||
{
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "device.h"
|
||||
#include "scene.h"
|
||||
#include "context.h"
|
||||
#include "../geometry/filter.h"
|
||||
#include "../../include/embree3/rtcore_ray.h"
|
||||
using namespace embree;
|
||||
|
||||
|
@ -482,7 +483,7 @@ RTC_NAMESPACE_BEGIN;
|
|||
|
||||
IntersectContext context(scene,user_context);
|
||||
#if !defined(EMBREE_RAY_PACKETS)
|
||||
Ray4* ray4 = (Ray4*) rayhit;
|
||||
RayHit4* ray4 = (RayHit4*) rayhit;
|
||||
for (size_t i=0; i<4; i++) {
|
||||
if (!valid[i]) continue;
|
||||
RayHit ray1; ray4->get(i,ray1);
|
||||
|
@ -513,7 +514,7 @@ RTC_NAMESPACE_BEGIN;
|
|||
|
||||
IntersectContext context(scene,user_context);
|
||||
#if !defined(EMBREE_RAY_PACKETS)
|
||||
Ray8* ray8 = (Ray8*) rayhit;
|
||||
RayHit8* ray8 = (RayHit8*) rayhit;
|
||||
for (size_t i=0; i<8; i++) {
|
||||
if (!valid[i]) continue;
|
||||
RayHit ray1; ray8->get(i,ray1);
|
||||
|
@ -546,7 +547,7 @@ RTC_NAMESPACE_BEGIN;
|
|||
|
||||
IntersectContext context(scene,user_context);
|
||||
#if !defined(EMBREE_RAY_PACKETS)
|
||||
Ray16* ray16 = (Ray16*) rayhit;
|
||||
RayHit16* ray16 = (RayHit16*) rayhit;
|
||||
for (size_t i=0; i<16; i++) {
|
||||
if (!valid[i]) continue;
|
||||
RayHit ray1; ray16->get(i,ray1);
|
||||
|
@ -1097,13 +1098,13 @@ RTC_NAMESPACE_BEGIN;
|
|||
RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
|
||||
{
|
||||
IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
|
||||
args->report(args,filter_args);
|
||||
isa::reportIntersection1(args, filter_args);
|
||||
}
|
||||
|
||||
RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
|
||||
{
|
||||
OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
|
||||
args->report(args,filter_args);
|
||||
isa::reportOcclusion1(args,filter_args);
|
||||
}
|
||||
|
||||
RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
|
||||
|
@ -1763,4 +1764,19 @@ RTC_NAMESPACE_BEGIN;
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
RTC_API RTCGeometry rtcGetGeometryThreadSafe (RTCScene hscene, unsigned int geomID)
|
||||
{
|
||||
Scene* scene = (Scene*) hscene;
|
||||
RTC_CATCH_BEGIN;
|
||||
RTC_TRACE(rtcGetGeometryThreadSafe);
|
||||
#if defined(DEBUG)
|
||||
RTC_VERIFY_HANDLE(hscene);
|
||||
RTC_VERIFY_GEOMID(geomID);
|
||||
#endif
|
||||
Ref<Geometry> geom = scene->get_locked(geomID);
|
||||
return (RTCGeometry) geom.ptr;
|
||||
RTC_CATCH_END2(scene);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
RTC_NAMESPACE_END
|
||||
|
|
|
@ -26,56 +26,59 @@ namespace embree
|
|||
|
||||
/*! Macros used in the rtcore API implementation */
|
||||
// -- GODOT start --
|
||||
// #define RTC_CATCH_BEGIN try {
|
||||
#define RTC_CATCH_BEGIN
|
||||
|
||||
// #define RTC_CATCH_END(device) \
|
||||
// } catch (std::bad_alloc&) { \
|
||||
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
// } catch (rtcore_error& e) { \
|
||||
// Device::process_error(device,e.error,e.what()); \
|
||||
// } catch (std::exception& e) { \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
// } catch (...) { \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
// }
|
||||
#define RTC_CATCH_END(device)
|
||||
|
||||
// #define RTC_CATCH_END2(scene) \
|
||||
// } catch (std::bad_alloc&) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
// } catch (rtcore_error& e) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,e.error,e.what()); \
|
||||
// } catch (std::exception& e) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
// } catch (...) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
// }
|
||||
#define RTC_CATCH_END2(scene)
|
||||
|
||||
// #define RTC_CATCH_END2_FALSE(scene) \
|
||||
// } catch (std::bad_alloc&) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
// return false; \
|
||||
// } catch (rtcore_error& e) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,e.error,e.what()); \
|
||||
// return false; \
|
||||
// } catch (std::exception& e) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
// return false; \
|
||||
// } catch (...) { \
|
||||
// Device* device = scene ? scene->device : nullptr; \
|
||||
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
// return false; \
|
||||
// }
|
||||
#define RTC_CATCH_END2_FALSE(scene) return false;
|
||||
|
||||
#if 0
|
||||
#define RTC_CATCH_BEGIN try {
|
||||
|
||||
#define RTC_CATCH_END(device) \
|
||||
} catch (std::bad_alloc&) { \
|
||||
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
} catch (rtcore_error& e) { \
|
||||
Device::process_error(device,e.error,e.what()); \
|
||||
} catch (std::exception& e) { \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
} catch (...) { \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
}
|
||||
|
||||
#define RTC_CATCH_END2(scene) \
|
||||
} catch (std::bad_alloc&) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
} catch (rtcore_error& e) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,e.error,e.what()); \
|
||||
} catch (std::exception& e) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
} catch (...) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
}
|
||||
|
||||
#define RTC_CATCH_END2_FALSE(scene) \
|
||||
} catch (std::bad_alloc&) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
|
||||
return false; \
|
||||
} catch (rtcore_error& e) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,e.error,e.what()); \
|
||||
return false; \
|
||||
} catch (std::exception& e) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
|
||||
return false; \
|
||||
} catch (...) { \
|
||||
Device* device = scene ? scene->device : nullptr; \
|
||||
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
|
||||
return false; \
|
||||
}
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
#define RTC_VERIFY_HANDLE(handle) \
|
||||
|
@ -103,39 +106,35 @@ namespace embree
|
|||
#define RTC_TRACE(x)
|
||||
#endif
|
||||
|
||||
// -- GODOT begin --
|
||||
// /*! used to throw embree API errors */
|
||||
// struct rtcore_error : public std::exception
|
||||
// {
|
||||
// __forceinline rtcore_error(RTCError error, const std::string& str)
|
||||
// : error(error), str(str) {}
|
||||
//
|
||||
// ~rtcore_error() throw() {}
|
||||
//
|
||||
// const char* what () const throw () {
|
||||
// return str.c_str();
|
||||
// }
|
||||
//
|
||||
// RTCError error;
|
||||
// std::string str;
|
||||
// };
|
||||
// -- GODOT end --
|
||||
// -- GODOT start --
|
||||
#if 0
|
||||
/*! used to throw embree API errors */
|
||||
struct rtcore_error : public std::exception
|
||||
{
|
||||
__forceinline rtcore_error(RTCError error, const std::string& str)
|
||||
: error(error), str(str) {}
|
||||
|
||||
~rtcore_error() throw() {}
|
||||
|
||||
const char* what () const throw () {
|
||||
return str.c_str();
|
||||
}
|
||||
|
||||
RTCError error;
|
||||
std::string str;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(DEBUG) // only report file and line in debug mode
|
||||
// -- GODOT begin --
|
||||
// #define throw_RTCError(error,str) \
|
||||
// throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
|
||||
#define throw_RTCError(error,str) \
|
||||
printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
|
||||
// -- GODOT end --
|
||||
// throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
|
||||
#else
|
||||
// -- GODOT begin --
|
||||
// #define throw_RTCError(error,str) \
|
||||
// throw rtcore_error(error,str);
|
||||
#define throw_RTCError(error,str) \
|
||||
abort();
|
||||
// -- GODOT end --
|
||||
// throw rtcore_error(error,str);
|
||||
#endif
|
||||
// -- GODOT end --
|
||||
|
||||
#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
|
||||
(settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member)))
|
||||
|
|
|
@ -371,7 +371,7 @@ RTC_NAMESPACE_BEGIN
|
|||
bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
|
||||
bvh->allocator.reset();
|
||||
|
||||
/* switch between differnet builders based on quality level */
|
||||
/* switch between different builders based on quality level */
|
||||
if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
|
||||
return rtcBuildBVHMorton(arguments);
|
||||
else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
|
||||
|
|
|
@ -629,9 +629,7 @@ namespace embree
|
|||
if (geometry == null)
|
||||
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
|
||||
|
||||
if (geometry->isEnabled()) {
|
||||
setModified ();
|
||||
}
|
||||
setModified ();
|
||||
accels_deleteGeometry(unsigned(geomID));
|
||||
id_pool.deallocate((unsigned)geomID);
|
||||
geometries[geomID] = null;
|
||||
|
|
|
@ -452,6 +452,10 @@ namespace embree
|
|||
const Vec3fa n1 = normal(index+1,itime);
|
||||
if (!isvalid(n0) || !isvalid(n1))
|
||||
return false;
|
||||
|
||||
const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
|
||||
if (!isvalid(b))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -612,6 +616,10 @@ namespace embree
|
|||
const Vec3fa dn1 = dnormal(index+1,itime);
|
||||
if (!isvalid(dn0) || !isvalid(dn1))
|
||||
return false;
|
||||
|
||||
const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
|
||||
if (!isvalid(b))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -144,7 +144,20 @@ namespace embree
|
|||
}
|
||||
|
||||
bool State::checkISASupport() {
|
||||
#if defined(__ARM_NEON)
|
||||
/*
|
||||
* NEON CPU type is a mixture of NEON and SSE2
|
||||
*/
|
||||
|
||||
bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
|
||||
|
||||
/* this will be true when explicitly initialize Device with `isa=neon` config */
|
||||
bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
|
||||
|
||||
return hasSSE2 || hasNEON;
|
||||
#else
|
||||
return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
|
||||
#endif
|
||||
}
|
||||
|
||||
void State::verify()
|
||||
|
@ -157,8 +170,10 @@ namespace embree
|
|||
* functions */
|
||||
#if defined(DEBUG)
|
||||
#if defined(EMBREE_TARGET_SSE2)
|
||||
#if !defined(__ARM_NEON)
|
||||
assert(sse2::getISA() <= SSE2);
|
||||
#endif
|
||||
#endif
|
||||
#if defined(EMBREE_TARGET_SSE42)
|
||||
assert(sse42::getISA() <= SSE42);
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
|
||||
// Copyright 2009-2020 Intel Corporation
|
||||
// Copyright 2009-2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/* #undef EMBREE_RAY_MASK */
|
||||
|
@ -20,6 +19,7 @@
|
|||
/* #undef EMBREE_COMPACT_POLYS */
|
||||
|
||||
#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
|
||||
#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
|
||||
|
||||
#if defined(EMBREE_GEOMETRY_TRIANGLE)
|
||||
#define IF_ENABLED_TRIS(x) x
|
||||
|
|
|
@ -225,7 +225,7 @@ namespace embree
|
|||
/* exit if convergence cannot get proven, but terminate if we are very small */
|
||||
if (unlikely(!subset(K,x) && !very_small)) return false;
|
||||
|
||||
/* solve using newton raphson iteration of convergence is guarenteed */
|
||||
/* solve using newton raphson iteration of convergence is guaranteed */
|
||||
solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ namespace embree
|
|||
const Vec3fa dir = ray.dir;
|
||||
const float length_ray_dir = length(dir);
|
||||
|
||||
/* error of curve evaluations is propertional to largest coordinate */
|
||||
/* error of curve evaluations is proportional to largest coordinate */
|
||||
const BBox3ff box = curve.bounds();
|
||||
const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
|
||||
|
||||
|
|
|
@ -68,15 +68,15 @@ namespace embree
|
|||
const Vec3vf<M> center = v0.xyz();
|
||||
const vfloat<M> radius = v0.w;
|
||||
|
||||
/* compute ray distance projC0 to hit point with ray oriented plane */
|
||||
const Vec3vf<M> c0 = center - ray_org;
|
||||
const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
|
||||
|
||||
valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
|
||||
if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
|
||||
valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections
|
||||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
|
||||
|
||||
/* check if hit point lies inside disc */
|
||||
const Vec3vf<M> perp = c0 - projC0 * ray_dir;
|
||||
const vfloat<M> l2 = dot(perp, perp);
|
||||
const vfloat<M> r2 = radius * radius;
|
||||
|
@ -84,6 +84,15 @@ namespace embree
|
|||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
|
||||
/* We reject hits where the ray origin lies inside the ray
|
||||
* oriented disc to avoid self intersections. */
|
||||
#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
|
||||
const vfloat<M> m2 = dot(c0, c0);
|
||||
valid &= (m2 > r2);
|
||||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
#endif
|
||||
|
||||
DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
|
||||
return epilog(valid, hit);
|
||||
}
|
||||
|
@ -152,15 +161,15 @@ namespace embree
|
|||
const Vec3vf<M> center = v0.xyz();
|
||||
const vfloat<M> radius = v0.w;
|
||||
|
||||
/* compute ray distance projC0 to hit point with ray oriented plane */
|
||||
const Vec3vf<M> c0 = center - ray_org;
|
||||
const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
|
||||
|
||||
valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
|
||||
if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
|
||||
valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections
|
||||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
|
||||
/* check if hit point lies inside disc */
|
||||
const Vec3vf<M> perp = c0 - projC0 * ray_dir;
|
||||
const vfloat<M> l2 = dot(perp, perp);
|
||||
const vfloat<M> r2 = radius * radius;
|
||||
|
@ -168,6 +177,15 @@ namespace embree
|
|||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
|
||||
/* We reject hits where the ray origin lies inside the ray
|
||||
* oriented disc to avoid self intersections. */
|
||||
#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
|
||||
const vfloat<M> m2 = dot(c0, c0);
|
||||
valid &= (m2 > r2);
|
||||
if (unlikely(none(valid)))
|
||||
return false;
|
||||
#endif
|
||||
|
||||
DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
|
||||
return epilog(valid, hit);
|
||||
}
|
||||
|
|
|
@ -51,20 +51,11 @@ namespace embree
|
|||
__forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
|
||||
{
|
||||
#if defined(EMBREE_FILTER_FUNCTION)
|
||||
IntersectContext* MAYBE_UNUSED context = args->internal_context;
|
||||
const Geometry* const geometry = args->geometry;
|
||||
if (geometry->intersectionFilterN) {
|
||||
assert(context->scene->hasGeometryFilterFunction());
|
||||
geometry->intersectionFilterN(filter_args);
|
||||
}
|
||||
if (args->geometry->intersectionFilterN)
|
||||
args->geometry->intersectionFilterN(filter_args);
|
||||
|
||||
//if (args->valid[0] == 0)
|
||||
// return;
|
||||
|
||||
if (context->user->filter) {
|
||||
assert(context->scene->hasContextFilterFunction());
|
||||
context->user->filter(filter_args);
|
||||
}
|
||||
if (args->context->filter)
|
||||
args->context->filter(filter_args);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -105,20 +96,11 @@ namespace embree
|
|||
__forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
|
||||
{
|
||||
#if defined(EMBREE_FILTER_FUNCTION)
|
||||
IntersectContext* MAYBE_UNUSED context = args->internal_context;
|
||||
const Geometry* const geometry = args->geometry;
|
||||
if (geometry->occlusionFilterN) {
|
||||
assert(context->scene->hasGeometryFilterFunction());
|
||||
geometry->occlusionFilterN(filter_args);
|
||||
}
|
||||
if (args->geometry->occlusionFilterN)
|
||||
args->geometry->occlusionFilterN(filter_args);
|
||||
|
||||
//if (args->valid[0] == 0)
|
||||
// return false;
|
||||
|
||||
if (context->user->filter) {
|
||||
assert(context->scene->hasContextFilterFunction());
|
||||
context->user->filter(filter_args);
|
||||
}
|
||||
if (args->context->filter)
|
||||
args->context->filter(filter_args);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ namespace embree
|
|||
return;
|
||||
#endif
|
||||
|
||||
accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
|
||||
accel->intersect(ray,prim.geomID(),prim.primID(),context);
|
||||
}
|
||||
|
||||
static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
|
||||
|
@ -44,7 +44,7 @@ namespace embree
|
|||
return false;
|
||||
#endif
|
||||
|
||||
accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
|
||||
accel->occluded(ray,prim.geomID(),prim.primID(),context);
|
||||
return ray.tfar < 0.0f;
|
||||
}
|
||||
|
||||
|
@ -89,7 +89,7 @@ namespace embree
|
|||
valid &= (ray.mask & accel->mask) != 0;
|
||||
if (none(valid)) return;
|
||||
#endif
|
||||
accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
|
||||
accel->intersect(valid,ray,prim.geomID(),prim.primID(),context);
|
||||
}
|
||||
|
||||
static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
|
||||
|
@ -102,7 +102,7 @@ namespace embree
|
|||
valid &= (ray.mask & accel->mask) != 0;
|
||||
if (none(valid)) return false;
|
||||
#endif
|
||||
accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
|
||||
accel->occluded(valid,ray,prim.geomID(),prim.primID(),context);
|
||||
return ray.tfar < 0.0f;
|
||||
}
|
||||
|
||||
|
|
|
@ -152,7 +152,7 @@ namespace embree
|
|||
Vec3vf<M> v0; // 1st vertex of the quads
|
||||
Vec3vf<M> v1; // 2nd vertex of the quads
|
||||
Vec3vf<M> v2; // 3rd vertex of the quads
|
||||
Vec3vf<M> v3; // 4rd vertex of the quads
|
||||
Vec3vf<M> v3; // 4th vertex of the quads
|
||||
private:
|
||||
vuint<M> geomIDs; // geometry ID
|
||||
vuint<M> primIDs; // primitive ID
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
For multiple connected round linear curve segments this construction
|
||||
yield a proper shape when viewed from the outside. Using the
|
||||
following CSG we can also handle the interiour in most common cases:
|
||||
following CSG we can also handle the interior in most common cases:
|
||||
|
||||
round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
|
||||
cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
|
||||
|
@ -431,7 +431,7 @@ namespace embree
|
|||
Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
|
||||
|
||||
Inserting the definition of w0 and dw and refactoring
|
||||
yield a furhter scaled Ng'':
|
||||
yield a further scaled Ng'':
|
||||
|
||||
Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue